diff options
Diffstat (limited to 'tools')
127 files changed, 15578 insertions, 2291 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 139d5e87dc95..b35d954d50c3 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -245,7 +245,7 @@ #define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0)) #define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/sysreg.h> @@ -338,6 +338,6 @@ static inline u32 __attribute_const__ read_cpuid_cachetype(void) { return read_cpuid(CTR_EL0); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h index bd592ca81571..bbfbd1497a2f 100644 --- a/tools/arch/arm64/include/asm/esr.h +++ b/tools/arch/arm64/include/asm/esr.h @@ -385,7 +385,7 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/types.h> static inline unsigned long esr_brk_comment(unsigned long esr) @@ -450,6 +450,6 @@ static inline bool esr_iss_is_eretab(unsigned long esr) } const char *esr_get_class_string(unsigned long esr); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ESR_H */ diff --git a/tools/arch/arm64/include/asm/gpr-num.h b/tools/arch/arm64/include/asm/gpr-num.h index 05da4a7c5788..a114e4f8209b 100644 --- a/tools/arch/arm64/include/asm/gpr-num.h +++ b/tools/arch/arm64/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 .equ .L__gpr_num_x\num, \num @@ -11,7 +11,7 @@ .equ .L__gpr_num_xzr, 31 .equ .L__gpr_num_wzr, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp 
num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ @@ -21,6 +21,6 @@ " .equ .L__gpr_num_xzr, 31\n" \ " .equ .L__gpr_num_wzr, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 65f2759ea27a..178b7322bf04 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -51,7 +51,7 @@ #ifndef CONFIG_BROKEN_GAS_INST -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ // The space separator is omitted so that __emit_inst(x) can be parsed as // either an assembler directive or an assembler macro argument. #define __emit_inst(x) .inst(x) @@ -70,11 +70,11 @@ (((x) >> 24) & 0x000000ff)) #endif /* CONFIG_CPU_BIG_ENDIAN */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __emit_inst(x) .long __INSTR_BSWAP(x) -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_BROKEN_GAS_INST */ @@ -1078,9 +1078,7 @@ #define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ GCS_CAP_VALID_TOKEN) -#define ARM64_FEATURE_FIELD_BITS 4 - -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro mrs_s, rt, sreg __emit_inst(0xd5200000|(\sreg)|(.L__gpr_num_\rt)) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index ed5f3892674c..a792a599b9d6 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -31,7 +31,7 @@ #define KVM_SPSR_FIQ 4 #define KVM_NR_SPSR 5 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/psci.h> #include <linux/types.h> #include <asm/ptrace.h> diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 56d7367ee344..21d8cee04638 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h 
@@ -167,7 +167,8 @@ #define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) #define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ (_AC(1, UL) << IRQ_S_TIMER) | \ - (_AC(1, UL) << IRQ_S_EXT)) + (_AC(1, UL) << IRQ_S_EXT) | \ + (_AC(1, UL) << IRQ_PMU_OVF)) /* AIA CSR bits */ #define TOPI_IID_SHIFT 16 @@ -280,7 +281,7 @@ #define CSR_HPMCOUNTER30H 0xc9e #define CSR_HPMCOUNTER31H 0xc9f -#define CSR_SSCOUNTOVF 0xda0 +#define CSR_SCOUNTOVF 0xda0 #define CSR_SSTATUS 0x100 #define CSR_SIE 0x104 diff --git a/tools/arch/s390/include/uapi/asm/bitsperlong.h b/tools/arch/s390/include/uapi/asm/bitsperlong.h index d2bb620119bf..a226a1686a53 100644 --- a/tools/arch/s390/include/uapi/asm/bitsperlong.h +++ b/tools/arch/s390/include/uapi/asm/bitsperlong.h @@ -2,11 +2,7 @@ #ifndef __ASM_S390_BITSPERLONG_H #define __ASM_S390_BITSPERLONG_H -#ifndef __s390x__ -#define __BITS_PER_LONG 32 -#else #define __BITS_PER_LONG 64 -#endif #include <asm-generic/bitsperlong.h> diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..ccc01ad6ff7c 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -320,7 +320,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */ #define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ @@ -407,9 +407,12 @@ #define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ -/* 
AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +/* + * Linux-defined word for use with scattered/synthetic bits. + */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index c683d609934b..8f10f2943370 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. 
*/ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 9792e329343e..1baa86dfe029 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -93,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 #define EXIT_REASON_MSR_READ_IMM 84 #define EXIT_REASON_MSR_WRITE_IMM 85 diff --git a/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk new file mode 100644 index 000000000000..cc4c7a3e6c2e --- /dev/null +++ b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk @@ -0,0 +1,34 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# Usage: awk -f gen-cpu-feature-names-x86.awk cpufeatures.h > cpu-feature-names.c +# + +BEGIN { + print "/* cpu feature name array generated from cpufeatures.h */" + print "/* Do not change this code. 
*/" + print + print "static const char *cpu_feature_names[(NCAPINTS+NBUGINTS)*32] = {" + + value_expr = "\\([0-9*+ ]+\\)" +} + +/^#define X86_FEATURE_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[" value "] = \"" $2 "\"," + } +} + +/^#define X86_BUG_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[NCAPINTS*32+(" value ")] = \"" $2 "\"," + } +} + +END { + print "};" +} diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 009633294b09..35aeeaf5f711 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -182,7 +182,7 @@ bpftool prog tracelog bpftool prog tracelog { stdout | stderr } *PROG* Dump the BPF stream of the program. BPF programs can write to these streams - at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write + at runtime with the **bpf_stream_vprintk_impl**\ () kfunc. The kernel may write error messages to the standard error stream. This facility should be used only for debugging purposes. 
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 32bbe29fe5f6..300a329bc581 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -315,5 +315,7 @@ endef ifeq ($(FEATURE_DISPLAY_DEFERRED),) $(call feature_display_entries) - $(info ) + ifeq ($(feature_display),1) + $(info ) + endif endif diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 49b0add392b1..95646290cb89 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -107,7 +107,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -115,7 +115,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$( ############################### $(OUTPUT)test-all.bin: - $(BUILD_ALL) || $(BUILD_ALL) -lopcodes -liberty + $(BUILD_ALL) $(OUTPUT)test-hello.bin: $(BUILD) diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index df4c3cc713ac..0a39bee261b9 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -143,13 +143,8 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( -#ifdef __s390x__ "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ -#else - "lr %r2, 
%r15\n" - "ahi %r15, -96\n" -#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index 426c89198135..ef4743aad188 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -27,7 +27,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) || defined(__s390__) +#elif defined(__s390x__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 33c9b578b3b2..a25e38d1c874 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -53,6 +53,76 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: 
reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. */ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. 
+ * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 80c028540656..d4e4e388e625 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,20 +315,20 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; - -#define bpf_stream_printk(stream_id, fmt, args...) \ -({ \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[___bpf_narg(args)]; \ - \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - ___bpf_fill(___param, args); \ - _Pragma("GCC diagnostic pop") \ - \ - bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\ +extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz, void *aux__prog) __weak __ksym; + +#define bpf_stream_printk(stream_id, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index dd3b2f57082d..85abc357da31 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11325,8 +11325,6 @@ static const char *arch_specific_syscall_pfx(void) return "ia32"; #elif defined(__s390x__) return "s390x"; -#elif defined(__s390__) - return "s390"; #elif defined(__arm__) return "arm"; #elif defined(__aarch64__) @@ -12113,8 +12111,6 @@ static const char *arch_specific_lib_paths(void) return "/lib/i386-linux-gnu"; #elif defined(__s390x__) return "/lib/s390x-linux-gnu"; -#elif defined(__s390__) - return "/lib/s390-linux-gnu"; #elif defined(__arm__) && defined(__SOFTFP__) return "/lib/arm-linux-gnueabi"; #elif defined(__arm__) && !defined(__SOFTFP__) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index c174b4086673..d1524f6f54ae 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1376,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec #elif defined(__s390x__) -/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. 
*/ - static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { unsigned int reg; diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 58086b101057..aadeb3abcad8 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -861,6 +861,18 @@ class TypeIndexedArray(Type): return [f"{member} = {self.c_name};", f"{presence} = n_{self.c_name};"] + def free_needs_iter(self): + return self.sub_type == 'nest' + + def _free_lines(self, ri, var, ref): + lines = [] + if self.sub_type == 'nest': + lines += [ + f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)", + f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', + ] + lines += f"free({var}->{ref}{self.c_name});", + return lines class TypeNestTypeValue(Type): def _complex_member_type(self, ri): diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 4faa4dd72f35..73d883128511 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +arch/x86/lib/cpu-feature-names.c arch/x86/lib/inat-tables.c /objtool +feature +FEATURE-DUMP.objtool fixdep libsubcmd/ diff --git a/tools/objtool/Build b/tools/objtool/Build index 8cd71b9a5eef..9982e665d58d 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -8,6 +8,9 @@ objtool-y += builtin-check.o objtool-y += elf.o objtool-y += objtool.o +objtool-$(BUILD_DISAS) += disas.o +objtool-$(BUILD_DISAS) += trace.o + objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o klp-post-link.o diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 021f55b7bd87..ad6e1ec706ce 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -70,6 +70,29 @@ OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) # Always want host compilation. 
HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" +# +# To support disassembly, objtool needs libopcodes which is provided +# with libbfd (binutils-dev or binutils-devel package). +# +FEATURE_USER = .objtool +FEATURE_TESTS = libbfd disassembler-init-styled +FEATURE_DISPLAY = +include $(srctree)/tools/build/Makefile.feature + +ifeq ($(feature-disassembler-init-styled), 1) + OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED +endif + +BUILD_DISAS := n + +ifeq ($(feature-libbfd),1) + BUILD_DISAS := y + OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool" + OBJTOOL_LDFLAGS += -lopcodes +endif + +export BUILD_DISAS + AWK = awk MKDIR = mkdir @@ -102,7 +125,10 @@ $(LIBSUBCMD)-clean: clean: $(LIBSUBCMD)-clean $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)$(RM) $(OUTPUT)arch/x86/lib/cpu-feature-names.c $(OUTPUT)fixdep $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.objtool + $(Q)$(RM) -r -- $(OUTPUT)feature FORCE: diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 0115b97c526b..6cd288150f49 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -1,12 +1,24 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <string.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/warn.h> #include <asm/inst.h> #include <asm/orc_types.h> #include <linux/objtool_types.h> #include <arch/elf.h> +const char *arch_reg_name[CFI_NUM_REGS] = { + "zero", "ra", "tp", "sp", + "a0", "a1", "a2", "a3", + "a4", "a5", "a6", "a7", + "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", + "t8", "u0", "fp", "s0", + "s1", "s2", "s3", "s4", + "s5", "s6", "s7", "s8" +}; + int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); @@ -414,3 +426,14 @@ unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *tabl return 
reloc->sym->offset + reloc_addend(reloc); } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_loongarch, + bfd_mach_loongarch32, bfd_mach_loongarch64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index a80b75f7b061..aba774109437 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -194,3 +194,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index 3a9b748216ed..e534ac1123b3 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -3,11 +3,24 @@ #include <stdio.h> #include <stdlib.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> #include <objtool/builtin.h> +const char *arch_reg_name[CFI_NUM_REGS] = { + "r0", "sp", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", + "r28", "r29", "r30", "r31", + "ra" +}; + int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); @@ -127,3 +140,14 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_powerpc, + bfd_mach_ppc, bfd_mach_ppc64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c index 51610689abf7..8f9bf61ca089 100644 --- a/tools/objtool/arch/powerpc/special.c +++ b/tools/objtool/arch/powerpc/special.c @@ -18,3 +18,8 @@ struct reloc 
*arch_find_switch_table(struct objtool_file *file, { exit(-1); } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build index 3dedb2fd8f3a..febee0b8ee0b 100644 --- a/tools/objtool/arch/x86/Build +++ b/tools/objtool/arch/x86/Build @@ -1,5 +1,5 @@ -objtool-y += special.o objtool-y += decode.o +objtool-y += special.o objtool-y += orc.o inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk @@ -12,3 +12,14 @@ $(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c CFLAGS_decode.o += -I$(OUTPUT)arch/x86/lib + +cpu_features = ../arch/x86/include/asm/cpufeatures.h +cpu_features_script = ../arch/x86/tools/gen-cpu-feature-names-x86.awk + +$(OUTPUT)arch/x86/lib/cpu-feature-names.c: $(cpu_features_script) $(cpu_features) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(AWK) -f $(cpu_features_script) $(cpu_features) > $@ + +$(OUTPUT)arch/x86/special.o: $(OUTPUT)arch/x86/lib/cpu-feature-names.c + +CFLAGS_special.o += -I$(OUTPUT)arch/x86/lib diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index cc85db7b65a4..f4af82508228 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -16,12 +16,21 @@ #include <asm/orc_types.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> #include <objtool/builtin.h> #include <arch/elf.h> +const char *arch_reg_name[CFI_NUM_REGS] = { + "rax", "rcx", "rdx", "rbx", + "rsp", "rbp", "rsi", "rdi", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "ra" +}; + int arch_ftrace_match(const char *name) { return !strcmp(name, "__fentry__"); @@ -949,3 +958,14 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) return false; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return 
disas_info_init(dinfo, bfd_arch_i386, + bfd_mach_i386_i386, bfd_mach_x86_64, + "att"); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 09300761f108..e817a3fff449 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -4,6 +4,10 @@ #include <objtool/special.h> #include <objtool/builtin.h> #include <objtool/warn.h> +#include <asm/cpufeatures.h> + +/* cpu feature name array generated from cpufeatures.h */ +#include "cpu-feature-names.c" void arch_handle_alternative(struct special_alt *alt) { @@ -134,3 +138,9 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, *table_size = 0; return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return (feature_number < ARRAY_SIZE(cpu_feature_names)) ? + cpu_feature_names[feature_number] : NULL; +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index aab7fa9c7e00..b780df513715 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -75,6 +75,7 @@ static const struct option check_options[] = { OPT_GROUP("Actions:"), OPT_BOOLEAN(0, "checksum", &opts.checksum, "generate per-function checksums"), OPT_BOOLEAN(0, "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_STRING_OPTARG('d', "disas", &opts.disas, "function-pattern", "disassemble functions", "*"), OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), @@ -103,8 +104,10 @@ static const struct option check_options[] = { OPT_STRING('o', "output", &opts.output, "file", "output file name"), OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), + 
OPT_STRING(0, "trace", &opts.trace, "func", "trace function validation"), OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), OPT_BOOLEAN(0, "werror", &opts.werror, "return error on warnings"), + OPT_BOOLEAN(0, "wide", &opts.wide, "wide output"), OPT_END(), }; @@ -175,6 +178,7 @@ static bool opts_valid(void) } if (opts.checksum || + opts.disas || opts.hack_jump_label || opts.hack_noinstr || opts.ibt || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 490cf78029b5..9ec0e07cce90 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4,6 +4,7 @@ */ #define _GNU_SOURCE /* memmem() */ +#include <fnmatch.h> #include <string.h> #include <stdlib.h> #include <inttypes.h> @@ -12,8 +13,10 @@ #include <objtool/builtin.h> #include <objtool/cfi.h> #include <objtool/arch.h> +#include <objtool/disas.h> #include <objtool/check.h> #include <objtool/special.h> +#include <objtool/trace.h> #include <objtool/warn.h> #include <objtool/checksum.h> #include <objtool/util.h> @@ -24,11 +27,6 @@ #include <linux/static_call_types.h> #include <linux/string.h> -struct alternative { - struct alternative *next; - struct instruction *insn; -}; - static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache; static struct cfi_init_state initial_func_cfi; @@ -36,6 +34,10 @@ static struct cfi_state init_cfi; static struct cfi_state func_cfi; static struct cfi_state force_undefined_cfi; +struct disas_context *objtool_disas_ctx; + +size_t sym_name_max_len; + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) { @@ -133,15 +135,6 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) -static inline struct symbol *insn_call_dest(struct instruction *insn) -{ - if (insn->type == INSN_JUMP_DYNAMIC || - insn->type == INSN_CALL_DYNAMIC) - return NULL; - - return insn->_call_dest; -} - static inline struct reloc 
*insn_jump_table(struct instruction *insn) { if (insn->type == INSN_JUMP_DYNAMIC || @@ -1758,6 +1751,7 @@ static int handle_group_alt(struct objtool_file *file, orig_alt_group->last_insn = last_orig_insn; orig_alt_group->nop = NULL; orig_alt_group->ignore = orig_insn->ignore_alts; + orig_alt_group->feature = 0; } else { if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len - orig_alt_group->first_insn->offset != special_alt->orig_len) { @@ -1862,6 +1856,7 @@ end: new_alt_group->nop = nop; new_alt_group->ignore = (*new_insn)->ignore_alts; new_alt_group->cfi = orig_alt_group->cfi; + new_alt_group->feature = special_alt->feature; return 0; } @@ -1926,7 +1921,9 @@ static int add_special_section_alts(struct objtool_file *file) struct list_head special_alts; struct instruction *orig_insn, *new_insn; struct special_alt *special_alt, *tmp; + enum alternative_type alt_type; struct alternative *alt; + struct alternative *a; if (special_get_alts(file->elf, &special_alts)) return -1; @@ -1961,9 +1958,15 @@ static int add_special_section_alts(struct objtool_file *file) if (handle_group_alt(file, special_alt, orig_insn, &new_insn)) return -1; + alt_type = ALT_TYPE_INSTRUCTIONS; + } else if (special_alt->jump_or_nop) { if (handle_jump_alt(file, special_alt, orig_insn, &new_insn)) return -1; + + alt_type = ALT_TYPE_JUMP_TABLE; + } else { + alt_type = ALT_TYPE_EX_TABLE; } alt = calloc(1, sizeof(*alt)); @@ -1973,8 +1976,20 @@ static int add_special_section_alts(struct objtool_file *file) } alt->insn = new_insn; - alt->next = orig_insn->alts; - orig_insn->alts = alt; + alt->type = alt_type; + alt->next = NULL; + + /* + * Store alternatives in the same order they have been + * defined. 
+ */ + if (!orig_insn->alts) { + orig_insn->alts = alt; + } else { + for (a = orig_insn->alts; a->next; a = a->next) + ; + a->next = alt; + } list_del(&special_alt->list); free(special_alt); @@ -2471,6 +2486,7 @@ static bool is_profiling_func(const char *name) static int classify_symbols(struct objtool_file *file) { struct symbol *func; + size_t len; for_each_sym(file->elf, func) { if (is_notype_sym(func) && strstarts(func->name, ".L")) @@ -2497,6 +2513,10 @@ static int classify_symbols(struct objtool_file *file) if (is_profiling_func(func->name)) func->profiling_func = true; + + len = strlen(func->name); + if (len > sym_name_max_len) + sym_name_max_len = len; } return 0; @@ -2605,7 +2625,7 @@ static int decode_sections(struct objtool_file *file) * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. */ - if (validate_branch_enabled() || opts.noinstr || opts.hack_jump_label) { + if (validate_branch_enabled() || opts.noinstr || opts.hack_jump_label || opts.disas) { if (add_special_section_alts(file)) return -1; } @@ -3557,8 +3577,10 @@ static bool skip_alt_group(struct instruction *insn) return false; /* ANNOTATE_IGNORE_ALTERNATIVE */ - if (insn->alt_group->ignore) + if (insn->alt_group->ignore) { + TRACE_ALT(insn, "alt group ignored"); return true; + } /* * For NOP patched with CLAC/STAC, only follow the latter to avoid @@ -3662,255 +3684,322 @@ static void checksum_update_insn(struct objtool_file *file, struct symbol *func, checksum_update(func, insn, &offset, sizeof(offset)); } -/* - * Follow the branch starting at the given instruction, and recursively follow - * any other branches (jumps). Meanwhile, track the frame pointer state at - * each instruction and validate all the rules described in - * tools/objtool/Documentation/objtool.txt. 
- */ static int validate_branch(struct objtool_file *file, struct symbol *func, - struct instruction *insn, struct insn_state state) + struct instruction *insn, struct insn_state state); +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state); + +static int validate_insn(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state *statep, + struct instruction *prev_insn, struct instruction *next_insn, + bool *dead_end) { + /* prev_state and alt_name are not used if there is no disassembly support */ + struct insn_state prev_state __maybe_unused; + char *alt_name __maybe_unused = NULL; struct alternative *alt; - struct instruction *next_insn, *prev_insn = NULL; u8 visited; int ret; - if (func && func->ignore) - return 0; + /* + * Any returns before the end of this function are effectively dead + * ends, i.e. validate_branch() has reached the end of the branch. + */ + *dead_end = true; - while (1) { - next_insn = next_insn_to_validate(file, insn); + visited = VISITED_BRANCH << statep->uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { + if (!insn->hint && !insn_cfi_match(insn, &statep->cfi)) + return 1; - if (opts.checksum && func && insn->sec) - checksum_update_insn(file, func, insn); + if (insn->visited & visited) { + TRACE_INSN(insn, "already visited"); + return 0; + } + } else { + nr_insns_visited++; + } - if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { - /* Ignore KCFI type preambles, which always fall through */ - if (is_prefix_func(func)) - return 0; + if (statep->noinstr) + statep->instr += insn->instr; - if (file->ignore_unreachables) - return 0; + if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; - WARN("%s() falls through to next function %s()", - func->name, insn_func(insn)->name); - func->warned = 1; + i = insn; + save_insn = NULL; - return 1; - } + sym_for_each_insn_continue_reverse(file, func, i) 
{ + if (i->save) { + save_insn = i; + break; + } + } - visited = VISITED_BRANCH << state.uaccess; - if (insn->visited & VISITED_BRANCH_MASK) { - if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) + if (!save_insn) { + WARN_INSN(insn, "no corresponding CFI save for CFI restore"); return 1; + } - if (insn->visited & visited) - return 0; + if (!save_insn->visited) { + /* + * If the restore hint insn is at the + * beginning of a basic block and was + * branched to from elsewhere, and the + * save insn hasn't been visited yet, + * defer following this branch for now. + * It will be seen later via the + * straight-line path. + */ + if (!prev_insn) { + TRACE_INSN(insn, "defer restore"); + return 0; + } + + WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); + return 1; + } + + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + + statep->cfi = *insn->cfi; + } else { + /* XXX track if we actually changed statep->cfi */ + + if (prev_insn && !cficmp(prev_insn->cfi, &statep->cfi)) { + insn->cfi = prev_insn->cfi; + nr_cfi_reused++; } else { - nr_insns_visited++; + insn->cfi = cfi_hash_find_or_add(&statep->cfi); } + } - if (state.noinstr) - state.instr += insn->instr; + insn->visited |= visited; - if (insn->hint) { - if (insn->restore) { - struct instruction *save_insn, *i; + if (propagate_alt_cfi(file, insn)) + return 1; - i = insn; - save_insn = NULL; + if (insn->alts) { + for (alt = insn->alts; alt; alt = alt->next) { + TRACE_ALT_BEGIN(insn, alt, alt_name); + ret = validate_branch(file, func, alt->insn, *statep); + TRACE_ALT_END(insn, alt, alt_name); + if (ret) { + BT_INSN(insn, "(alt)"); + return ret; + } + } + TRACE_ALT_INFO_NOADDR(insn, "/ ", "DEFAULT"); + } - sym_for_each_insn_continue_reverse(file, func, i) { - if (i->save) { - save_insn = i; - break; - } - } + if (skip_alt_group(insn)) + return 0; - if (!save_insn) { - WARN_INSN(insn, "no corresponding CFI save for CFI restore"); - return 1; - } + prev_state = *statep; + ret = 
handle_insn_ops(insn, next_insn, statep); + TRACE_INSN_STATE(insn, &prev_state, statep); - if (!save_insn->visited) { - /* - * If the restore hint insn is at the - * beginning of a basic block and was - * branched to from elsewhere, and the - * save insn hasn't been visited yet, - * defer following this branch for now. - * It will be seen later via the - * straight-line path. - */ - if (!prev_insn) - return 0; + if (ret) + return 1; - WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); - return 1; - } + switch (insn->type) { - insn->cfi = save_insn->cfi; - nr_cfi_reused++; - } + case INSN_RETURN: + TRACE_INSN(insn, "return"); + return validate_return(func, insn, statep); - state.cfi = *insn->cfi; - } else { - /* XXX track if we actually changed state.cfi */ + case INSN_CALL: + case INSN_CALL_DYNAMIC: + if (insn->type == INSN_CALL) + TRACE_INSN(insn, "call"); + else + TRACE_INSN(insn, "indirect call"); - if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) { - insn->cfi = prev_insn->cfi; - nr_cfi_reused++; - } else { - insn->cfi = cfi_hash_find_or_add(&state.cfi); - } + ret = validate_call(file, insn, statep); + if (ret) + return ret; + + if (opts.stackval && func && !is_special_call(insn) && + !has_valid_stack_frame(statep)) { + WARN_INSN(insn, "call without frame pointer save/setup"); + return 1; } - insn->visited |= visited; + break; - if (propagate_alt_cfi(file, insn)) - return 1; + case INSN_JUMP_CONDITIONAL: + case INSN_JUMP_UNCONDITIONAL: + if (is_sibling_call(insn)) { + TRACE_INSN(insn, "sibling call"); + ret = validate_sibling_call(file, insn, statep); + if (ret) + return ret; - if (insn->alts) { - for (alt = insn->alts; alt; alt = alt->next) { - ret = validate_branch(file, func, alt->insn, state); - if (ret) { - BT_INSN(insn, "(alt)"); - return ret; - } + } else if (insn->jump_dest) { + if (insn->type == INSN_JUMP_UNCONDITIONAL) + TRACE_INSN(insn, "unconditional jump"); + else + TRACE_INSN(insn, "jump taken"); + + ret = 
validate_branch(file, func, insn->jump_dest, *statep); + if (ret) { + BT_INSN(insn, "(branch)"); + return ret; } } - if (skip_alt_group(insn)) + if (insn->type == INSN_JUMP_UNCONDITIONAL) return 0; - if (handle_insn_ops(insn, next_insn, &state)) - return 1; - - switch (insn->type) { - - case INSN_RETURN: - return validate_return(func, insn, &state); + TRACE_INSN(insn, "jump not taken"); + break; - case INSN_CALL: - case INSN_CALL_DYNAMIC: - ret = validate_call(file, insn, &state); + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + TRACE_INSN(insn, "indirect jump"); + if (is_sibling_call(insn)) { + ret = validate_sibling_call(file, insn, statep); if (ret) return ret; + } - if (opts.stackval && func && !is_special_call(insn) && - !has_valid_stack_frame(&state)) { - WARN_INSN(insn, "call without frame pointer save/setup"); - return 1; - } + if (insn->type == INSN_JUMP_DYNAMIC) + return 0; - break; + break; - case INSN_JUMP_CONDITIONAL: - case INSN_JUMP_UNCONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; + case INSN_SYSCALL: + TRACE_INSN(insn, "syscall"); + if (func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } - } else if (insn->jump_dest) { - ret = validate_branch(file, func, - insn->jump_dest, state); - if (ret) { - BT_INSN(insn, "(branch)"); - return ret; - } - } + break; - if (insn->type == INSN_JUMP_UNCONDITIONAL) - return 0; + case INSN_SYSRET: + TRACE_INSN(insn, "sysret"); + if (func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + return 0; + + case INSN_STAC: + TRACE_INSN(insn, "stac"); + if (!opts.uaccess) break; - case INSN_JUMP_DYNAMIC: - case INSN_JUMP_DYNAMIC_CONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; - } + if (statep->uaccess) { + WARN_INSN(insn, 
"recursive UACCESS enable"); + return 1; + } - if (insn->type == INSN_JUMP_DYNAMIC) - return 0; + statep->uaccess = true; + break; + case INSN_CLAC: + TRACE_INSN(insn, "clac"); + if (!opts.uaccess) break; - case INSN_SYSCALL: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + if (!statep->uaccess && func) { + WARN_INSN(insn, "redundant UACCESS disable"); + return 1; + } - break; + if (func_uaccess_safe(func) && !statep->uaccess_stack) { + WARN_INSN(insn, "UACCESS-safe disables UACCESS"); + return 1; + } - case INSN_SYSRET: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + statep->uaccess = false; + break; - return 0; + case INSN_STD: + TRACE_INSN(insn, "std"); + if (statep->df) { + WARN_INSN(insn, "recursive STD"); + return 1; + } - case INSN_STAC: - if (!opts.uaccess) - break; + statep->df = true; + break; - if (state.uaccess) { - WARN_INSN(insn, "recursive UACCESS enable"); - return 1; - } + case INSN_CLD: + TRACE_INSN(insn, "cld"); + if (!statep->df && func) { + WARN_INSN(insn, "redundant CLD"); + return 1; + } - state.uaccess = true; - break; + statep->df = false; + break; - case INSN_CLAC: - if (!opts.uaccess) - break; + default: + break; + } - if (!state.uaccess && func) { - WARN_INSN(insn, "redundant UACCESS disable"); - return 1; - } + if (insn->dead_end) + TRACE_INSN(insn, "dead end"); - if (func_uaccess_safe(func) && !state.uaccess_stack) { - WARN_INSN(insn, "UACCESS-safe disables UACCESS"); - return 1; - } + *dead_end = insn->dead_end; + return 0; +} - state.uaccess = false; - break; +/* + * Follow the branch starting at the given instruction, and recursively follow + * any other branches (jumps). Meanwhile, track the frame pointer state at + * each instruction and validate all the rules described in + * tools/objtool/Documentation/objtool.txt. 
+ */ +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + struct instruction *next_insn, *prev_insn = NULL; + bool dead_end; + int ret; - case INSN_STD: - if (state.df) { - WARN_INSN(insn, "recursive STD"); - return 1; - } + if (func && func->ignore) + return 0; - state.df = true; - break; + do { + insn->trace = 0; + next_insn = next_insn_to_validate(file, insn); - case INSN_CLD: - if (!state.df && func) { - WARN_INSN(insn, "redundant CLD"); - return 1; - } + if (opts.checksum && func && insn->sec) + checksum_update_insn(file, func, insn); - state.df = false; - break; + if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { + /* Ignore KCFI type preambles, which always fall through */ + if (is_prefix_func(func)) + return 0; - default: - break; + if (file->ignore_unreachables) + return 0; + + WARN("%s() falls through to next function %s()", + func->name, insn_func(insn)->name); + func->warned = 1; + + return 1; } - if (insn->dead_end) - return 0; + ret = validate_insn(file, func, insn, &state, prev_insn, next_insn, + &dead_end); - if (!next_insn) { + if (!insn->trace) { + if (ret) + TRACE_INSN(insn, "warning (%d)", ret); + else + TRACE_INSN(insn, NULL); + } + + if (!dead_end && !next_insn) { if (state.cfi.cfa.base == CFI_UNDEFINED) return 0; if (file->ignore_unreachables) @@ -3924,9 +4013,22 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, prev_insn = insn; insn = next_insn; - } - return 0; + } while (!dead_end); + + return ret; +} + +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + int ret; + + trace_depth_inc(); + ret = do_validate_branch(file, func, insn, state); + trace_depth_dec(); + + return ret; } static int validate_unwind_hint(struct objtool_file *file, @@ -4385,10 +4487,18 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if 
(opts.checksum) checksum_init(func); + if (opts.trace && !fnmatch(opts.trace, sym->name, 0)) { + trace_enable(); + TRACE("%s: validation begin\n", sym->name); + } + ret = validate_branch(file, func, insn, *state); if (ret) BT_INSN(insn, "<=== (sym)"); + TRACE("%s: validation %s\n\n", sym->name, ret ? "failed" : "end"); + trace_disable(); + if (opts.checksum) checksum_finish(func); @@ -4657,6 +4767,8 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, ".llvm.call-graph-profile") || !strcmp(sec->name, ".llvm_bb_addr_map") || !strcmp(sec->name, "__tracepoints") || + !strcmp(sec->name, ".return_sites") || + !strcmp(sec->name, ".call_sites") || !strcmp(sec->name, "__patchable_function_entries")) continue; @@ -4731,87 +4843,6 @@ static int validate_reachable_instructions(struct objtool_file *file) return warnings; } -/* 'funcs' is a space-separated list of function names */ -static void disas_funcs(const char *funcs) -{ - const char *objdump_str, *cross_compile; - int size, ret; - char *cmd; - - cross_compile = getenv("CROSS_COMPILE"); - if (!cross_compile) - cross_compile = ""; - - objdump_str = "%sobjdump -wdr %s | gawk -M -v _funcs='%s' '" - "BEGIN { split(_funcs, funcs); }" - "/^$/ { func_match = 0; }" - "/<.*>:/ { " - "f = gensub(/.*<(.*)>:/, \"\\\\1\", 1);" - "for (i in funcs) {" - "if (funcs[i] == f) {" - "func_match = 1;" - "base = strtonum(\"0x\" $1);" - "break;" - "}" - "}" - "}" - "{" - "if (func_match) {" - "addr = strtonum(\"0x\" $1);" - "printf(\"%%04x \", addr - base);" - "print;" - "}" - "}' 1>&2"; - - /* fake snprintf() to calculate the size */ - size = snprintf(NULL, 0, objdump_str, cross_compile, objname, funcs) + 1; - if (size <= 0) { - WARN("objdump string size calculation failed"); - return; - } - - cmd = malloc(size); - - /* real snprintf() */ - snprintf(cmd, size, objdump_str, cross_compile, objname, funcs); - ret = system(cmd); - if (ret) { - WARN("disassembly failed: %d", ret); - return; - } -} - -static void 
disas_warned_funcs(struct objtool_file *file) -{ - struct symbol *sym; - char *funcs = NULL, *tmp; - - for_each_sym(file->elf, sym) { - if (sym->warned) { - if (!funcs) { - funcs = malloc(strlen(sym->name) + 1); - if (!funcs) { - ERROR_GLIBC("malloc"); - return; - } - strcpy(funcs, sym->name); - } else { - tmp = malloc(strlen(funcs) + strlen(sym->name) + 2); - if (!tmp) { - ERROR_GLIBC("malloc"); - return; - } - sprintf(tmp, "%s %s", funcs, sym->name); - free(funcs); - funcs = tmp; - } - } - } - - if (funcs) - disas_funcs(funcs); -} - __weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) { unsigned int type = reloc_type(reloc); @@ -4881,10 +4912,35 @@ static void free_insns(struct objtool_file *file) free(chunk->addr); } +const char *objtool_disas_insn(struct instruction *insn) +{ + struct disas_context *dctx = objtool_disas_ctx; + + if (!dctx) + return ""; + + disas_insn(dctx, insn); + return disas_result(dctx); +} + int check(struct objtool_file *file) { + struct disas_context *disas_ctx = NULL; int ret = 0, warnings = 0; + /* + * Create a disassembly context if we might disassemble any + * instruction or function. 
+ */ + if (opts.verbose || opts.backtrace || opts.trace || opts.disas) { + disas_ctx = disas_context_create(file); + if (!disas_ctx) { + opts.disas = false; + opts.trace = false; + } + objtool_disas_ctx = disas_ctx; + } + arch_initial_func_cfi_state(&initial_func_cfi); init_cfi_state(&init_cfi); init_cfi_state(&func_cfi); @@ -5005,8 +5061,6 @@ int check(struct objtool_file *file) goto out; } - free_insns(file); - if (opts.stats) { printf("nr_insns_visited: %ld\n", nr_insns_visited); printf("nr_cfi: %ld\n", nr_cfi); @@ -5015,18 +5069,30 @@ int check(struct objtool_file *file) } out: - if (!ret && !warnings) - return 0; + if (ret || warnings) { + if (opts.werror && warnings) + ret = 1; - if (opts.werror && warnings) - ret = 1; + if (opts.verbose) { + if (opts.werror && warnings) + WARN("%d warning(s) upgraded to errors", warnings); + disas_warned_funcs(disas_ctx); + } + } - if (opts.verbose) { - if (opts.werror && warnings) - WARN("%d warning(s) upgraded to errors", warnings); - disas_warned_funcs(file); + if (opts.disas) + disas_funcs(disas_ctx); + + if (disas_ctx) { + disas_context_destroy(disas_ctx); + objtool_disas_ctx = NULL; } + free_insns(file); + + if (!ret && !warnings) + return 0; + if (opts.backup && make_backup()) return 1; diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c new file mode 100644 index 000000000000..2b5059f55e40 --- /dev/null +++ b/tools/objtool/disas.c @@ -0,0 +1,1248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com> + */ + +#define _GNU_SOURCE +#include <fnmatch.h> + +#include <objtool/arch.h> +#include <objtool/check.h> +#include <objtool/disas.h> +#include <objtool/special.h> +#include <objtool/warn.h> + +#include <bfd.h> +#include <linux/string.h> +#include <tools/dis-asm-compat.h> + +/* + * Size of the buffer for storing the result of disassembling + * a single instruction. 
+ */
+#define DISAS_RESULT_SIZE	1024
+
+/*
+ * Disassembly context: ties the objtool file and the instruction being
+ * disassembled to the disassembler function and its disassemble info.
+ */
+struct disas_context {
+	struct objtool_file *file;		/* file the instructions come from */
+	struct instruction *insn;		/* instruction being disassembled */
+	bool alt_applied;			/* insn is disassembled at the original code location */
+	char result[DISAS_RESULT_SIZE];		/* text of the disassembled instruction */
+	disassembler_ftype disassembler;	/* disassembler function */
+	struct disassemble_info info;		/* disassembler settings and callbacks */
+};
+
+/*
+ * Maximum number of alternatives
+ */
+#define DISAS_ALT_MAX		5
+
+/*
+ * Maximum number of instructions per alternative
+ */
+#define DISAS_ALT_INSN_MAX	50
+
+/*
+ * Information to disassemble an alternative
+ */
+struct disas_alt {
+	struct instruction *orig_insn;		/* original instruction */
+	struct alternative *alt;		/* alternative or NULL if default code */
+	char *name;				/* name for this alternative */
+	int width;				/* formatting width */
+	struct {
+		char *str;			/* instruction string */
+		int offset;			/* instruction offset */
+		int nops;			/* number of nops */
+	} insn[DISAS_ALT_INSN_MAX];		/* alternative instructions */
+	int insn_idx;				/* index of the next instruction to print */
+};
+
+/* The default alternative (i.e. the original code) has no struct alternative */
+#define DALT_DEFAULT(dalt)	(!(dalt)->alt)
+/* First instruction of the alternative (or of the original code) */
+#define DALT_INSN(dalt)		(DALT_DEFAULT(dalt) ? (dalt)->orig_insn : (dalt)->alt->insn)
+#define DALT_GROUP(dalt)	(DALT_INSN(dalt)->alt_group)
+/* Alternatives are identified by the offset of the original instruction */
+#define DALT_ALTID(dalt)	((dalt)->orig_insn->offset)
+
+/*
+ * An alternative feature value holds the feature number in its low
+ * ALT_FLAGS_SHIFT bits and the flags in the bits above.
+ */
+#define ALT_FLAGS_SHIFT		16
+#define ALT_FLAG_NOT		(1 << 0)
+#define ALT_FLAG_DIRECT_CALL	(1 << 1)
+#define ALT_FEATURE_MASK	((1 << ALT_FLAGS_SHIFT) - 1)
+
+/*
+ * Return the feature number stored in a feature/flags value.
+ */
+static int alt_feature(unsigned int ft_flags)
+{
+	return (ft_flags & ALT_FEATURE_MASK);
+}
+
+/*
+ * Return the flags (ALT_FLAG_*) stored in a feature/flags value.
+ */
+static int alt_flags(unsigned int ft_flags)
+{
+	return (ft_flags >> ALT_FLAGS_SHIFT);
+}
+
+/*
+ * Wrapper around vasprintf() to allocate and format a string.
+ * Return the allocated string or NULL on error. The returned
+ * string has to be released with free().
+ */
+static char *strfmt(const char *fmt, ...)
+{
+	va_list ap;
+	char *str;
+	int rv;
+
+	va_start(ap, fmt);
+	rv = vasprintf(&str, fmt, ap);
+	va_end(ap);
+
+	return rv == -1 ? NULL : str;
+}
+
+/*
+ * Number of bytes sprint_name() can write in addition to the name
+ * itself: "+0x", the hexadecimal digits of an unsigned long offset
+ * and the terminating NUL (accounted for by sizeof("+0x")).
+ */
+#define SPRINT_NAME_EXTRA	(sizeof("+0x") + 2 * sizeof(unsigned long))
+
+/*
+ * Format "<name>+0x<offset>" (or just "<name>" if offset is zero) into
+ * str and return the length of the resulting string.
+ *
+ * The output is not bounded: str must provide room for at least
+ * strlen(name) + SPRINT_NAME_EXTRA bytes.
+ */
+static int sprint_name(char *str, const char *name, unsigned long offset)
+{
+	int len;
+
+	if (offset)
+		len = sprintf(str, "%s+0x%lx", name, offset);
+	else
+		len = sprintf(str, "%s", name);
+
+	return len;
+}
+
+/*
+ * Print using the output function and stream set in the disassemble info.
+ */
+#define DINFO_FPRINTF(dinfo, ...)	\
+	((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__))
+
+/*
+ * Append formatted text to the result buffer of the disassembly context.
+ *
+ * Return 0 on success. Return -1 and emit a warning if the buffer is
+ * already full, or if the text does not entirely fit in the remaining
+ * space (in that case the buffer holds the truncated text).
+ */
+static int disas_result_fprintf(struct disas_context *dctx,
+				const char *fmt, va_list ap)
+{
+	char *buf = dctx->result;
+	int avail, len;
+
+	len = strlen(buf);
+	if (len >= DISAS_RESULT_SIZE - 1) {
+		WARN_FUNC(dctx->insn->sec, dctx->insn->offset,
+			  "disassembly buffer is full");
+		return -1;
+	}
+	avail = DISAS_RESULT_SIZE - len;
+
+	len = vsnprintf(buf + len, avail, fmt, ap);
+	if (len < 0 || len >= avail) {
+		WARN_FUNC(dctx->insn->sec, dctx->insn->offset,
+			  "disassembly buffer is truncated");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * fprintf()-like output function for the disassembler. The stream is
+ * the disassembly context, the output is appended to its result buffer.
+ */
+static int disas_fprintf(void *stream, const char *fmt, ...)
+{
+	va_list arg;
+	int rv;
+
+	va_start(arg, fmt);
+	rv = disas_result_fprintf(stream, fmt, arg);
+	va_end(arg);
+
+	return rv;
+}
+
+/*
+ * For init_disassemble_info_compat().
+ *
+ * Same as disas_fprintf(), the style argument is ignored.
+ */
+static int disas_fprintf_styled(void *stream,
+				enum disassembler_style style,
+				const char *fmt, ...)
+{
+	va_list arg;
+	int rv;
+
+	va_start(arg, fmt);
+	rv = disas_result_fprintf(stream, fmt, arg);
+	va_end(arg);
+
+	return rv;
+}
+
+/*
+ * Print an address as "0x<addr> <name>". If sym is not NULL then the
+ * name is the symbol name plus the offset of the address from the
+ * symbol, otherwise the name is built from the section and the address.
+ */
+static void disas_print_addr_sym(struct section *sec, struct symbol *sym,
+				 bfd_vma addr, struct disassemble_info *dinfo)
+{
+	char *str;
+
+	if (sym) {
+		/*
+		 * Symbol names have no fixed maximum length, so size the
+		 * buffer from the name instead of using a fixed-size one.
+		 */
+		str = malloc(strlen(sym->name) + SPRINT_NAME_EXTRA);
+		if (!str) {
+			/* out of memory: print the symbol name without offset */
+			DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name);
+			return;
+		}
+		sprint_name(str, sym->name, addr - sym->offset);
+	} else {
+		str = offstr(sec, addr);
+	}
+
+	DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+	free(str);
+}
+
+/*
+ * Print an address referenced from an alternative which is disassembled
+ * at the location of the original code. Return true if the address was
+ * printed, false if this case does not apply and nothing was printed.
+ */
+static bool disas_print_addr_alt(bfd_vma addr, struct disassemble_info *dinfo)
+{
+	struct disas_context *dctx = dinfo->application_data;
+	struct instruction *orig_first_insn;
+	struct alt_group *alt_group;
+	unsigned long offset;
+	struct symbol *sym;
+
+	/*
+	 * Check if we are processing an alternative at the original
+	 * instruction address (i.e. if alt_applied is true) and if
+	 * we are referencing an address inside the alternative.
+	 *
+	 * For example, this happens if there is a branch inside an
+	 * alternative. In that case, the address should be updated
+	 * to a reference inside the original instruction flow.
+	 */
+	if (!dctx->alt_applied)
+		return false;
+
+	alt_group = dctx->insn->alt_group;
+	if (!alt_group || !alt_group->orig_group ||
+	    addr < alt_group->first_insn->offset ||
+	    addr > alt_group->last_insn->offset)
+		return false;
+
+	orig_first_insn = alt_group->orig_group->first_insn;
+	offset = addr - alt_group->first_insn->offset;
+
+	/* same offset, but relative to the start of the original code */
+	addr = orig_first_insn->offset + offset;
+	sym = orig_first_insn->sym;
+
+	disas_print_addr_sym(orig_first_insn->sec, sym, addr, dinfo);
+
+	return true;
+}
+
+/*
+ * Print an address which is not resolved with a relocation. The symbol
+ * of the current instruction is used if the address is inside this
+ * symbol, otherwise the address is printed relative to the section.
+ */
+static void disas_print_addr_noreloc(bfd_vma addr,
+				     struct disassemble_info *dinfo)
+{
+	struct disas_context *dctx = dinfo->application_data;
+	struct instruction *insn = dctx->insn;
+	struct symbol *sym = NULL;
+
+	if (disas_print_addr_alt(addr, dinfo))
+		return;
+
+	if (insn->sym && addr >= insn->sym->offset &&
+	    addr < insn->sym->offset + insn->sym->len) {
+		sym = insn->sym;
+	}
+
+	disas_print_addr_sym(insn->sec, sym, addr, dinfo);
+}
+
+/*
+ * Print an address which is expected to be resolved with a relocation
+ * of the current instruction.
+ */
+static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo)
+{
+	struct disas_context *dctx = dinfo->application_data;
+	struct instruction *insn = dctx->insn;
+	unsigned long offset;
+	struct reloc *reloc;
+	char *str;
+
+	reloc = find_reloc_by_dest_range(dctx->file->elf, insn->sec,
+					 insn->offset, insn->len);
+	if (!reloc) {
+		/*
+		 * There is no relocation for this instruction although
+		 * the address to resolve points to the next instruction.
+		 * So this is an effective reference to the next IP, for
+		 * example: "lea 0x0(%rip),%rdi". The kernel can reference
+		 * the next IP with _THIS_IP_ macro.
+		 */
+		DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr);
+		return;
+	}
+
+	offset = arch_insn_adjusted_addend(insn, reloc);
+
+	/*
+	 * If the relocation symbol is a section name (for example ".bss")
+	 * then we try to further resolve the name.
+	 */
+	if (reloc->sym->type == STT_SECTION) {
+		str = offstr(reloc->sym->sec, reloc->sym->offset + offset);
+	} else {
+		/*
+		 * Symbol names have no fixed maximum length, so size the
+		 * buffer from the name instead of using a fixed-size one.
+		 */
+		str = malloc(strlen(reloc->sym->name) + SPRINT_NAME_EXTRA);
+		if (!str) {
+			/* out of memory: print the symbol name without offset */
+			DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr,
+				      reloc->sym->name);
+			return;
+		}
+		sprint_name(str, reloc->sym->name, offset);
+	}
+
+	DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+	free(str);
+}
+
+/*
+ * Resolve an address into a "<symbol>+<offset>" string.
+ */
+static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo)
+{
+	struct disas_context *dctx = dinfo->application_data;
+	struct instruction *insn = dctx->insn;
+	struct instruction *jump_dest;
+	struct symbol *sym;
+	bool is_reloc;
+
+	/*
+	 * If the instruction is a call/jump and it references a
+	 * destination then this is likely the address we are looking
+	 * up. So check it first.
+	 */
+	jump_dest = insn->jump_dest;
+	if (jump_dest && jump_dest->sym && jump_dest->offset == addr) {
+		if (!disas_print_addr_alt(addr, dinfo))
+			disas_print_addr_sym(jump_dest->sec, jump_dest->sym,
+					     addr, dinfo);
+		return;
+	}
+
+	/*
+	 * If the address points to the next instruction then there is
+	 * probably a relocation. It can be a false positive when the
+	 * current instruction is referencing the address of the next
+	 * instruction. This particular case will be handled in
+	 * disas_print_addr_reloc().
+	 */
+	is_reloc = (addr == insn->offset + insn->len);
+
+	/*
+	 * The call destination offset can be the address we are looking
+	 * up, or 0 if there is a relocation.
+	 */
+	sym = insn_call_dest(insn);
+	if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) {
+		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name);
+		return;
+	}
+
+	if (!is_reloc)
+		disas_print_addr_noreloc(addr, dinfo);
+	else
+		disas_print_addr_reloc(addr, dinfo);
+}
+
+/*
+ * Initialize disassemble info arch, mach (32 or 64-bit) and options.
+ */
+int disas_info_init(struct disassemble_info *dinfo,
+		    int arch, int mach32, int mach64,
+		    const char *options)
+{
+	struct disas_context *dctx = dinfo->application_data;
+	struct objtool_file *file = dctx->file;
+
+	dinfo->arch = arch;
+
+	/* select the 32-bit or 64-bit machine from the ELF class of the file */
+	switch (file->elf->ehdr.e_ident[EI_CLASS]) {
+	case ELFCLASS32:
+		dinfo->mach = mach32;
+		break;
+	case ELFCLASS64:
+		dinfo->mach = mach64;
+		break;
+	default:
+		return -1;
+	}
+
+	dinfo->disassembler_options = options;
+
+	return 0;
+}
+
+/*
+ * Create a disassembly context for the specified file.
+ *
+ * Return the new context, or NULL (after emitting a warning) if the
+ * context cannot be allocated, if the architecture cannot be identified
+ * or if no disassembler function can be created. The context has to be
+ * released with disas_context_destroy().
+ */
+struct disas_context *disas_context_create(struct objtool_file *file)
+{
+	struct disas_context *dctx;
+	struct disassemble_info *dinfo;
+	int err;
+
+	/*
+	 * Zero the context so that the result buffer is an empty string
+	 * and there is no current instruction until an instruction is
+	 * effectively disassembled.
+	 */
+	dctx = calloc(1, sizeof(*dctx));
+	if (!dctx) {
+		WARN("failed to allocate disassembly context");
+		return NULL;
+	}
+
+	dctx->file = file;
+	dinfo = &dctx->info;
+
+	init_disassemble_info_compat(dinfo, dctx,
+				     disas_fprintf, disas_fprintf_styled);
+
+	dinfo->read_memory_func = buffer_read_memory;
+	dinfo->print_address_func = disas_print_address;
+	dinfo->application_data = dctx;
+
+	/*
+	 * bfd_openr() is not used to avoid doing ELF data processing
+	 * and caching that has already been done. Here, we just need
+	 * to identify the target file so we call an arch specific
+	 * function to fill some disassemble info (arch, mach).
+	 */
+
+	dinfo->arch = bfd_arch_unknown;
+	dinfo->mach = 0;
+
+	err = arch_disas_info_init(dinfo);
+	if (err || dinfo->arch == bfd_arch_unknown || dinfo->mach == 0) {
+		WARN("failed to init disassembly arch");
+		goto error;
+	}
+
+	dinfo->endian = (file->elf->ehdr.e_ident[EI_DATA] == ELFDATA2MSB) ?
+		BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
+
+	disassemble_init_for_target(dinfo);
+
+	dctx->disassembler = disassembler(dinfo->arch,
+					  dinfo->endian == BFD_ENDIAN_BIG,
+					  dinfo->mach, NULL);
+	if (!dctx->disassembler) {
+		WARN("failed to create disassembler function");
+		goto error;
+	}
+
+	return dctx;
+
+error:
+	free(dctx);
+	return NULL;
+}
+
+/*
+ * Release a context created with disas_context_create().
+ */
+void disas_context_destroy(struct disas_context *dctx)
+{
+	free(dctx);
+}
+
+/*
+ * Return the result buffer of the context. The buffer belongs to the
+ * context and it is reset each time an instruction is disassembled.
+ */
+char *disas_result(struct disas_context *dctx)
+{
+	return dctx->result;
+}
+
+#define DISAS_INSN_OFFSET_SPACE		10
+#define DISAS_INSN_SPACE		60
+
+#define DISAS_PRINSN(dctx, insn, depth)			\
+	disas_print_insn(stdout, dctx, insn, depth, "\n")
+
+/*
+ * Print a message in the instruction flow. If sec is not NULL then the
+ * address at the section offset is printed in addition to the message,
+ * otherwise only the message is printed.
+ *
+ * One vertical bar is printed per depth level. A negative depth is
+ * handled as depth 0 with a narrower address column.
+ *
+ * Return the sum of the values returned by the print functions.
+ */
+static int disas_vprint(FILE *stream, struct section *sec, unsigned long offset,
+			int depth, const char *format, va_list ap)
+{
+	const char *addr_str;
+	int i, n;
+	int len;
+
+	len = sym_name_max_len + DISAS_INSN_OFFSET_SPACE;
+	if (depth < 0) {
+		len += depth;
+		depth = 0;
+	}
+
+	n = 0;
+
+	if (sec) {
+		addr_str = offstr(sec, offset);
+		n += fprintf(stream, "%6lx: %-*s ", offset, len, addr_str);
+		free((char *)addr_str);
+	} else {
+		/* no address: pad over the whole address column */
+		len += DISAS_INSN_OFFSET_SPACE + 1;
+		n += fprintf(stream, "%-*s", len, "");
+	}
+
+	/* print vertical bars to show the code flow */
+	for (i = 0; i < depth; i++)
+		n += fprintf(stream, "| ");
+
+	if (format)
+		n += vfprintf(stream, format, ap);
+
+	return n;
+}
+
+/*
+ * Variadic version of disas_vprint().
+ */
+static int disas_print(FILE *stream, struct section *sec, unsigned long offset,
+			int depth, const char *format, ...)
+{
+	va_list args;
+	int len;
+
+	va_start(args, format);
+	len = disas_vprint(stream, sec, offset, depth, format, args);
+	va_end(args);
+
+	return len;
+}
+
+/*
+ * Print a message in the instruction flow.
If insn is not NULL then
+ * the instruction address is printed in addition to the message,
+ * otherwise only the message is printed. In all cases, the instruction
+ * itself is not printed.
+ */
+void disas_print_info(FILE *stream, struct instruction *insn, int depth,
+		      const char *format, ...)
+{
+	struct section *sec = NULL;
+	unsigned long off = 0;
+	va_list ap;
+
+	/* without an instruction there is no address to print */
+	if (insn) {
+		sec = insn->sec;
+		off = insn->offset;
+	}
+
+	va_start(ap, format);
+	disas_vprint(stream, sec, off, depth, format, ap);
+	va_end(ap);
+}
+
+/*
+ * Print the address of an instruction (offset and function) followed by
+ * the disassembled instruction and, optionally, by a message.
+ *
+ * With a NULL format nothing is printed after the instruction. With the
+ * "\n" format only a newline is printed. With any other format the
+ * message is printed after a " - " separator.
+ */
+void disas_print_insn(FILE *stream, struct disas_context *dctx,
+		      struct instruction *insn, int depth,
+		      const char *format, ...)
+{
+	char nop_str[32];
+	const char *text;
+	va_list ap;
+	int indent;
+	int width;
+
+	if (!insn->sec && insn->type == INSN_NOP) {
+		/*
+		 * An alternative can insert a fake nop which has no
+		 * section, so there is nothing to disassemble.
+		 */
+		snprintf(nop_str, sizeof(nop_str), "<fake nop> (%d bytes)", insn->len);
+		text = nop_str;
+	} else {
+		disas_insn(dctx, insn);
+		text = disas_result(dctx);
+	}
+
+	/* the instruction column shrinks with the depth */
+	indent = (depth + 1) * 2;
+	if (indent < DISAS_INSN_SPACE)
+		width = DISAS_INSN_SPACE - indent;
+	else
+		width = 1;
+
+	disas_print_info(stream, insn, depth, "%-*s", width, text);
+
+	if (!format)
+		return;
+
+	if (!strcmp(format, "\n")) {
+		fprintf(stream, "\n");
+		return;
+	}
+
+	fprintf(stream, " - ");
+	va_start(ap, format);
+	vfprintf(stream, format, ap);
+	va_end(ap);
+}
+
+/*
+ * Disassemble a single instruction. Return the size of the instruction.
+ *
+ * If alt_applied is true then insn should be an instruction from an
+ * alternative (i.e. insn->alt_group != NULL), and it is disassembled
+ * at the location of the original code it is replacing.
When the + * instruction references any address inside the alternative then + * these references will be re-adjusted to replace the original code. + */ +static size_t disas_insn_common(struct disas_context *dctx, + struct instruction *insn, + bool alt_applied) +{ + disassembler_ftype disasm = dctx->disassembler; + struct disassemble_info *dinfo = &dctx->info; + + dctx->insn = insn; + dctx->alt_applied = alt_applied; + dctx->result[0] = '\0'; + + if (insn->type == INSN_NOP) { + DINFO_FPRINTF(dinfo, "nop%d", insn->len); + return insn->len; + } + + /* + * Set the disassembler buffer to read data from the section + * containing the instruction to disassemble. + */ + dinfo->buffer = insn->sec->data->d_buf; + dinfo->buffer_vma = 0; + dinfo->buffer_length = insn->sec->sh.sh_size; + + return disasm(insn->offset, &dctx->info); +} + +size_t disas_insn(struct disas_context *dctx, struct instruction *insn) +{ + return disas_insn_common(dctx, insn, false); +} + +static size_t disas_insn_alt(struct disas_context *dctx, + struct instruction *insn) +{ + return disas_insn_common(dctx, insn, true); +} + +static struct instruction *next_insn_same_alt(struct objtool_file *file, + struct alt_group *alt_grp, + struct instruction *insn) +{ + if (alt_grp->last_insn == insn || alt_grp->nop == insn) + return NULL; + + return next_insn_same_sec(file, insn); +} + +#define alt_for_each_insn(file, alt_grp, insn) \ + for (insn = alt_grp->first_insn; \ + insn; \ + insn = next_insn_same_alt(file, alt_grp, insn)) + +/* + * Provide a name for the type of alternatives present at the + * specified instruction. + * + * An instruction can have alternatives with different types, for + * example alternative instructions and an exception table. In that + * case the name for the alternative instructions type is used. + * + * Return NULL if the instruction as no alternative. 
+ */ +const char *disas_alt_type_name(struct instruction *insn) +{ + struct alternative *alt; + const char *name; + + name = NULL; + for (alt = insn->alts; alt; alt = alt->next) { + if (alt->type == ALT_TYPE_INSTRUCTIONS) { + name = "alternative"; + break; + } + + switch (alt->type) { + case ALT_TYPE_EX_TABLE: + name = "ex_table"; + break; + case ALT_TYPE_JUMP_TABLE: + name = "jump_table"; + break; + default: + name = "unknown"; + break; + } + } + + return name; +} + +/* + * Provide a name for an alternative. + */ +char *disas_alt_name(struct alternative *alt) +{ + char pfx[4] = { 0 }; + char *str = NULL; + const char *name; + int feature; + int flags; + int num; + + switch (alt->type) { + + case ALT_TYPE_EX_TABLE: + str = strdup("EXCEPTION"); + break; + + case ALT_TYPE_JUMP_TABLE: + str = strdup("JUMP"); + break; + + case ALT_TYPE_INSTRUCTIONS: + /* + * This is a non-default group alternative. Create a name + * based on the feature and flags associated with this + * alternative. Use either the feature name (it is available) + * or the feature number. And add a prefix to show the flags + * used. + * + * Prefix flags characters: + * + * '!' alternative used when feature not enabled + * '+' direct call alternative + * '?' unknown flag + */ + + if (!alt->insn->alt_group) + return NULL; + + feature = alt->insn->alt_group->feature; + num = alt_feature(feature); + flags = alt_flags(feature); + str = pfx; + + if (flags & ~(ALT_FLAG_NOT | ALT_FLAG_DIRECT_CALL)) + *str++ = '?'; + if (flags & ALT_FLAG_DIRECT_CALL) + *str++ = '+'; + if (flags & ALT_FLAG_NOT) + *str++ = '!'; + + name = arch_cpu_feature_name(num); + if (!name) + str = strfmt("%sFEATURE 0x%X", pfx, num); + else + str = strfmt("%s%s", pfx, name); + + break; + } + + return str; +} + +/* + * Initialize an alternative. The default alternative should be initialized + * with alt=NULL. 
+ */ +static int disas_alt_init(struct disas_alt *dalt, + struct instruction *orig_insn, + struct alternative *alt) +{ + dalt->orig_insn = orig_insn; + dalt->alt = alt; + dalt->insn_idx = 0; + dalt->name = alt ? disas_alt_name(alt) : strdup("DEFAULT"); + if (!dalt->name) + return -1; + dalt->width = strlen(dalt->name); + + return 0; +} + +static int disas_alt_add_insn(struct disas_alt *dalt, int index, char *insn_str, + int offset, int nops) +{ + int len; + + if (index >= DISAS_ALT_INSN_MAX) { + WARN("Alternative %lx.%s has more instructions than supported", + DALT_ALTID(dalt), dalt->name); + return -1; + } + + len = strlen(insn_str); + dalt->insn[index].str = insn_str; + dalt->insn[index].offset = offset; + dalt->insn[index].nops = nops; + if (len > dalt->width) + dalt->width = len; + + return 0; +} + +static int disas_alt_jump(struct disas_alt *dalt) +{ + struct instruction *orig_insn; + struct instruction *dest_insn; + char suffix[2] = { 0 }; + char *str; + int nops; + + orig_insn = dalt->orig_insn; + dest_insn = dalt->alt->insn; + + if (orig_insn->type == INSN_NOP) { + if (orig_insn->len == 5) + suffix[0] = 'q'; + str = strfmt("jmp%-3s %lx <%s+0x%lx>", suffix, + dest_insn->offset, dest_insn->sym->name, + dest_insn->offset - dest_insn->sym->offset); + nops = 0; + } else { + str = strfmt("nop%d", orig_insn->len); + nops = orig_insn->len; + } + + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, nops); + + return 1; +} + +/* + * Disassemble an exception table alternative. + */ +static int disas_alt_extable(struct disas_alt *dalt) +{ + struct instruction *alt_insn; + char *str; + + alt_insn = dalt->alt->insn; + str = strfmt("resume at 0x%lx <%s+0x%lx>", + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, 0); + + return 1; +} + +/* + * Disassemble an alternative and store instructions in the disas_alt + * structure. 
Return the number of instructions in the alternative. + */ +static int disas_alt_group(struct disas_context *dctx, struct disas_alt *dalt) +{ + struct objtool_file *file; + struct instruction *insn; + int offset; + char *str; + int count; + int nops; + int err; + + file = dctx->file; + count = 0; + offset = 0; + nops = 0; + + alt_for_each_insn(file, DALT_GROUP(dalt), insn) { + + disas_insn_alt(dctx, insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + + nops = insn->type == INSN_NOP ? insn->len : 0; + err = disas_alt_add_insn(dalt, count, str, offset, nops); + if (err) + break; + offset += insn->len; + count++; + } + + return count; +} + +/* + * Disassemble the default alternative. + */ +static int disas_alt_default(struct disas_context *dctx, struct disas_alt *dalt) +{ + char *str; + int nops; + int err; + + if (DALT_GROUP(dalt)) + return disas_alt_group(dctx, dalt); + + /* + * Default alternative with no alt_group: this is the default + * code associated with either a jump table or an exception + * table and no other instruction alternatives. In that case + * the default alternative is made of a single instruction. + */ + disas_insn(dctx, dalt->orig_insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + nops = dalt->orig_insn->type == INSN_NOP ? dalt->orig_insn->len : 0; + err = disas_alt_add_insn(dalt, 0, str, 0, nops); + if (err) + return -1; + + return 1; +} + +/* + * For each alternative, if there is an instruction at the specified + * offset then print this instruction, otherwise print a blank entry. + * The offset is an offset from the start of the alternative. + * + * Return the offset for the next instructions to print, or -1 if all + * instructions have been printed. 
+ */ +static int disas_alt_print_insn(struct disas_alt *dalts, int alt_count, + int insn_count, int offset) +{ + struct disas_alt *dalt; + int offset_next; + char *str; + int i, j; + + offset_next = -1; + + for (i = 0; i < alt_count; i++) { + dalt = &dalts[i]; + j = dalt->insn_idx; + if (j == -1) { + printf("| %-*s ", dalt->width, ""); + continue; + } + + if (dalt->insn[j].offset == offset) { + str = dalt->insn[j].str; + printf("| %-*s ", dalt->width, str ?: ""); + if (++j < insn_count) { + dalt->insn_idx = j; + } else { + dalt->insn_idx = -1; + continue; + } + } else { + printf("| %-*s ", dalt->width, ""); + } + + if (dalt->insn[j].offset > 0 && + (offset_next == -1 || + (dalt->insn[j].offset < offset_next))) + offset_next = dalt->insn[j].offset; + } + printf("\n"); + + return offset_next; +} + +/* + * Print all alternatives side-by-side. + */ +static void disas_alt_print_wide(char *alt_name, struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct instruction *orig_insn; + int offset_next; + int offset; + int i; + + orig_insn = dalts[0].orig_insn; + + /* + * Print an header with the name of each alternative. + */ + disas_print_info(stdout, orig_insn, -2, NULL); + + if (strlen(alt_name) > dalts[0].width) + dalts[0].width = strlen(alt_name); + printf("| %-*s ", dalts[0].width, alt_name); + + for (i = 1; i < alt_count; i++) + printf("| %-*s ", dalts[i].width, dalts[i].name); + + printf("\n"); + + /* + * Print instructions for each alternative. + */ + offset_next = 0; + do { + offset = offset_next; + disas_print(stdout, orig_insn->sec, orig_insn->offset + offset, + -2, NULL); + offset_next = disas_alt_print_insn(dalts, alt_count, insn_count, + offset); + } while (offset_next > offset); +} + +/* + * Print all alternatives one above the other. 
+ */ +static void disas_alt_print_compact(char *alt_name, struct disas_alt *dalts, + int alt_count, int insn_count) +{ + struct instruction *orig_insn; + int width; + int i, j; + int len; + + orig_insn = dalts[0].orig_insn; + + len = disas_print(stdout, orig_insn->sec, orig_insn->offset, 0, NULL); + printf("%s\n", alt_name); + + /* + * If all alternatives have a single instruction then print each + * alternative on a single line. Otherwise, print alternatives + * one above the other with a clear separation. + */ + + if (insn_count == 1) { + width = 0; + for (i = 0; i < alt_count; i++) { + if (dalts[i].width > width) + width = dalts[i].width; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %-*s (if %s)\n", len, "", width, + dalts[i].insn[0].str, dalts[i].name); + } + + return; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %s\n", len, "", dalts[i].name); + for (j = 0; j < insn_count; j++) { + if (!dalts[i].insn[j].str) + break; + disas_print(stdout, orig_insn->sec, + orig_insn->offset + dalts[i].insn[j].offset, 0, + "| %s\n", dalts[i].insn[j].str); + } + printf("%*s|\n", len, ""); + } +} + +/* + * Trim NOPs in alternatives. This replaces trailing NOPs in alternatives + * with a single indication of the number of bytes covered with NOPs. + * + * Return the maximum numbers of instructions in all alternatives after + * trailing NOPs have been trimmed. 
+ */ +static int disas_alt_trim_nops(struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct disas_alt *dalt; + int nops_count; + const char *s; + int offset; + int count; + int nops; + int i, j; + + count = 0; + for (i = 0; i < alt_count; i++) { + offset = 0; + nops = 0; + nops_count = 0; + dalt = &dalts[i]; + for (j = insn_count - 1; j >= 0; j--) { + if (!dalt->insn[j].str || !dalt->insn[j].nops) + break; + offset = dalt->insn[j].offset; + free(dalt->insn[j].str); + dalt->insn[j].offset = 0; + dalt->insn[j].str = NULL; + nops += dalt->insn[j].nops; + nops_count++; + } + + /* + * All trailing NOPs have been removed. If there was a single + * NOP instruction then re-add it. If there was a block of + * NOPs then indicate the number of bytes than the block + * covers (nop*<number-of-bytes>). + */ + if (nops_count) { + s = nops_count == 1 ? "" : "*"; + dalt->insn[j + 1].str = strfmt("nop%s%d", s, nops); + dalt->insn[j + 1].offset = offset; + dalt->insn[j + 1].nops = nops; + j++; + } + + if (j > count) + count = j; + } + + return count + 1; +} + +/* + * Disassemble an alternative. + * + * Return the last instruction in the default alternative so that + * disassembly can continue with the next instruction. Return NULL + * on error. + */ +static void *disas_alt(struct disas_context *dctx, + struct instruction *orig_insn) +{ + struct disas_alt dalts[DISAS_ALT_MAX] = { 0 }; + struct instruction *last_insn = NULL; + struct alternative *alt; + struct disas_alt *dalt; + int insn_count = 0; + int alt_count = 0; + char *alt_name; + int count; + int i, j; + int err; + + alt_name = strfmt("<%s.%lx>", disas_alt_type_name(orig_insn), + orig_insn->offset); + if (!alt_name) { + WARN("Failed to define name for alternative at instruction 0x%lx", + orig_insn->offset); + goto done; + } + + /* + * Initialize and disassemble the default alternative. 
+ */ + err = disas_alt_init(&dalts[0], orig_insn, NULL); + if (err) { + WARN("%s: failed to initialize default alternative", alt_name); + goto done; + } + + insn_count = disas_alt_default(dctx, &dalts[0]); + if (insn_count < 0) { + WARN("%s: failed to disassemble default alternative", alt_name); + goto done; + } + + /* + * Initialize and disassemble all other alternatives. + */ + i = 1; + for (alt = orig_insn->alts; alt; alt = alt->next) { + if (i >= DISAS_ALT_MAX) { + WARN("%s has more alternatives than supported", alt_name); + break; + } + + dalt = &dalts[i]; + err = disas_alt_init(dalt, orig_insn, alt); + if (err) { + WARN("%s: failed to disassemble alternative", alt_name); + goto done; + } + + count = -1; + switch (dalt->alt->type) { + case ALT_TYPE_INSTRUCTIONS: + count = disas_alt_group(dctx, dalt); + break; + case ALT_TYPE_EX_TABLE: + count = disas_alt_extable(dalt); + break; + case ALT_TYPE_JUMP_TABLE: + count = disas_alt_jump(dalt); + break; + } + if (count < 0) { + WARN("%s: failed to disassemble alternative %s", + alt_name, dalt->name); + goto done; + } + + insn_count = count > insn_count ? count : insn_count; + i++; + } + alt_count = i; + + /* + * Print default and non-default alternatives. + */ + + insn_count = disas_alt_trim_nops(dalts, alt_count, insn_count); + + if (opts.wide) + disas_alt_print_wide(alt_name, dalts, alt_count, insn_count); + else + disas_alt_print_compact(alt_name, dalts, alt_count, insn_count); + + last_insn = orig_insn->alt_group ? orig_insn->alt_group->last_insn : + orig_insn; + +done: + for (i = 0; i < alt_count; i++) { + free(dalts[i].name); + for (j = 0; j < insn_count; j++) + free(dalts[i].insn[j].str); + } + + free(alt_name); + + return last_insn; +} + +/* + * Disassemble a function. 
+ */ +static void disas_func(struct disas_context *dctx, struct symbol *func) +{ + struct instruction *insn_start; + struct instruction *insn; + + printf("%s:\n", func->name); + sym_for_each_insn(dctx->file, func, insn) { + if (insn->alts) { + insn_start = insn; + insn = disas_alt(dctx, insn); + if (insn) + continue; + /* + * There was an error with disassembling + * the alternative. Resume disassembling + * at the current instruction, this will + * disassemble the default alternative + * only and continue with the code after + * the alternative. + */ + insn = insn_start; + } + + DISAS_PRINSN(dctx, insn, 0); + } + printf("\n"); +} + +/* + * Disassemble all warned functions. + */ +void disas_warned_funcs(struct disas_context *dctx) +{ + struct symbol *sym; + + if (!dctx) + return; + + for_each_sym(dctx->file->elf, sym) { + if (sym->warned) + disas_func(dctx, sym); + } +} + +void disas_funcs(struct disas_context *dctx) +{ + bool disas_all = !strcmp(opts.disas, "*"); + struct section *sec; + struct symbol *sym; + + for_each_sec(dctx->file->elf, sec) { + + if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + continue; + + sec_for_each_sym(sec, sym) { + /* + * If the function had a warning and the verbose + * option is used then the function was already + * disassemble. 
+ */ + if (opts.verbose && sym->warned) + continue; + + if (disas_all || fnmatch(opts.disas, sym->name, 0) == 0) + disas_func(dctx, sym); + } + } +} diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index d89f8b5ec14e..8866158975fc 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -103,4 +103,15 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); +extern const char *arch_reg_name[CFI_NUM_REGS]; + +#ifdef DISAS + +#include <bfd.h> +#include <dis-asm.h> + +int arch_disas_info_init(struct disassemble_info *dinfo); + +#endif /* DISAS */ + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index bb0b25eb08ba..b9e229ed4dc0 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -28,6 +28,7 @@ struct opts { bool static_call; bool uaccess; int prefix; + const char *disas; /* options: */ bool backtrace; @@ -41,8 +42,10 @@ struct opts { const char *output; bool sec_address; bool stats; + const char *trace; bool verbose; bool werror; + bool wide; }; extern struct opts opts; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index d73b0c3ae1ee..2e1346ad5e92 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -36,6 +36,19 @@ struct alt_group { struct cfi_state **cfi; bool ignore; + unsigned int feature; +}; + +enum alternative_type { + ALT_TYPE_INSTRUCTIONS, + ALT_TYPE_JUMP_TABLE, + ALT_TYPE_EX_TABLE, +}; + +struct alternative { + struct alternative *next; + struct instruction *insn; + enum alternative_type type; }; #define INSN_CHUNK_BITS 8 @@ -66,7 +79,8 @@ struct instruction { visited : 4, no_reloc : 1, hole : 1, - fake : 1; + fake : 1, + trace : 1; 
/* 9 bit hole */ struct alt_group *alt_group; @@ -117,6 +131,15 @@ static inline bool is_jump(struct instruction *insn) return is_static_jump(insn) || is_dynamic_jump(insn); } +static inline struct symbol *insn_call_dest(struct instruction *insn) +{ + if (insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_CALL_DYNAMIC) + return NULL; + + return insn->_call_dest; +} + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); @@ -127,4 +150,14 @@ struct instruction *next_insn_same_sec(struct objtool_file *file, struct instruc insn && insn->sec == _sec; \ insn = next_insn_same_sec(file, insn)) +#define sym_for_each_insn(file, sym, insn) \ + for (insn = find_insn(file, sym->sec, sym->offset); \ + insn && insn->offset < sym->offset + sym->len; \ + insn = next_insn_same_sec(file, insn)) + +const char *objtool_disas_insn(struct instruction *insn); + +extern size_t sym_name_max_len; +extern struct disas_context *objtool_disas_ctx; + #endif /* _CHECK_H */ diff --git a/tools/objtool/include/objtool/disas.h b/tools/objtool/include/objtool/disas.h new file mode 100644 index 000000000000..e8f395eff159 --- /dev/null +++ b/tools/objtool/include/objtool/disas.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. 
+ */ + +#ifndef _DISAS_H +#define _DISAS_H + +struct alternative; +struct disas_context; +struct disassemble_info; + +#ifdef DISAS + +struct disas_context *disas_context_create(struct objtool_file *file); +void disas_context_destroy(struct disas_context *dctx); +void disas_warned_funcs(struct disas_context *dctx); +void disas_funcs(struct disas_context *dctx); +int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options); +size_t disas_insn(struct disas_context *dctx, struct instruction *insn); +char *disas_result(struct disas_context *dctx); +void disas_print_info(FILE *stream, struct instruction *insn, int depth, + const char *format, ...); +void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...); +char *disas_alt_name(struct alternative *alt); +const char *disas_alt_type_name(struct instruction *insn); + +#else /* DISAS */ + +#include <objtool/warn.h> + +static inline struct disas_context *disas_context_create(struct objtool_file *file) +{ + WARN("Rebuild with libopcodes for disassembly support"); + return NULL; +} + +static inline void disas_context_destroy(struct disas_context *dctx) {} +static inline void disas_warned_funcs(struct disas_context *dctx) {} +static inline void disas_funcs(struct disas_context *dctx) {} + +static inline int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options) +{ + return -1; +} + +static inline size_t disas_insn(struct disas_context *dctx, + struct instruction *insn) +{ + return -1; +} + +static inline char *disas_result(struct disas_context *dctx) +{ + return NULL; +} + +static inline void disas_print_info(FILE *stream, struct instruction *insn, + int depth, const char *format, ...) {} +static inline void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...) 
{} +static inline char *disas_alt_name(struct alternative *alt) +{ + return NULL; +} + +static inline const char *disas_alt_type_name(struct instruction *insn) +{ + return NULL; +} + +#endif /* DISAS */ + +#endif /* _DISAS_H */ diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index 72d09c0adf1a..121c3761899c 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -25,7 +25,7 @@ struct special_alt { struct section *new_sec; unsigned long new_off; - unsigned int orig_len, new_len; /* group only */ + unsigned int orig_len, new_len, feature; /* group only */ }; int special_get_alts(struct elf *elf, struct list_head *alts); @@ -38,4 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn, unsigned long *table_size); +const char *arch_cpu_feature_name(int feature_number); + #endif /* _SPECIAL_H */ diff --git a/tools/objtool/include/objtool/trace.h b/tools/objtool/include/objtool/trace.h new file mode 100644 index 000000000000..70b574366797 --- /dev/null +++ b/tools/objtool/include/objtool/trace.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#ifndef _TRACE_H +#define _TRACE_H + +#include <objtool/check.h> +#include <objtool/disas.h> + +#ifdef DISAS + +extern bool trace; +extern int trace_depth; + +#define TRACE(fmt, ...) \ +({ if (trace) \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ +}) + +/* + * Print the instruction address and a message. The instruction + * itself is not printed. + */ +#define TRACE_ADDR(insn, fmt, ...) \ +({ \ + if (trace) { \ + disas_print_info(stderr, insn, trace_depth - 1, \ + fmt "\n", ##__VA_ARGS__); \ + } \ +}) + +/* + * Print the instruction address, the instruction and a message. + */ +#define TRACE_INSN(insn, fmt, ...) 
\ +({ \ + if (trace) { \ + disas_print_insn(stderr, objtool_disas_ctx, \ + insn, trace_depth - 1, \ + fmt, ##__VA_ARGS__); \ + fprintf(stderr, "\n"); \ + insn->trace = 1; \ + } \ +}) + +#define TRACE_INSN_STATE(insn, sprev, snext) \ +({ \ + if (trace) \ + trace_insn_state(insn, sprev, snext); \ +}) + +#define TRACE_ALT_FMT(pfx, fmt) pfx "<%s.%lx> " fmt +#define TRACE_ALT_ARG(insn) disas_alt_type_name(insn), (insn)->offset + +#define TRACE_ALT(insn, fmt, ...) \ + TRACE_INSN(insn, TRACE_ALT_FMT("", fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO(insn, pfx, fmt, ...) \ + TRACE_ADDR(insn, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO_NOADDR(insn, pfx, fmt, ...) \ + TRACE_ADDR(NULL, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_BEGIN(insn, alt, alt_name) \ +({ \ + if (trace) { \ + alt_name = disas_alt_name(alt); \ + trace_alt_begin(insn, alt, alt_name); \ + } \ +}) + +#define TRACE_ALT_END(insn, alt, alt_name) \ +({ \ + if (trace) { \ + trace_alt_end(insn, alt, alt_name); \ + free(alt_name); \ + } \ +}) + +static inline void trace_enable(void) +{ + trace = true; + trace_depth = 0; +} + +static inline void trace_disable(void) +{ + trace = false; +} + +static inline void trace_depth_inc(void) +{ + if (trace) + trace_depth++; +} + +static inline void trace_depth_dec(void) +{ + if (trace) + trace_depth--; +} + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext); +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); + +#else /* DISAS */ + +#define TRACE(fmt, ...) ({}) +#define TRACE_ADDR(insn, fmt, ...) ({}) +#define TRACE_INSN(insn, fmt, ...) ({}) +#define TRACE_INSN_STATE(insn, sprev, snext) ({}) +#define TRACE_ALT(insn, fmt, ...) ({}) +#define TRACE_ALT_INFO(insn, fmt, ...) 
({}) +#define TRACE_ALT_INFO_NOADDR(insn, fmt, ...) ({}) +#define TRACE_ALT_BEGIN(insn, alt, alt_name) ({}) +#define TRACE_ALT_END(insn, alt, alt_name) ({}) + + +static inline void trace_enable(void) {} +static inline void trace_disable(void) {} +static inline void trace_depth_inc(void) {} +static inline void trace_depth_dec(void) {} +static inline void trace_alt_begin(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; +static inline void trace_alt_end(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; + +#endif + +#endif /* _TRACE_H */ diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index a1e3927d8e7c..25ff7942b4d5 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -77,9 +77,11 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define WARN_INSN(insn, format, ...) \ ({ \ struct instruction *_insn = (insn); \ - if (!_insn->sym || !_insn->sym->warned) \ + if (!_insn->sym || !_insn->sym->warned) { \ WARN_FUNC(_insn->sec, _insn->offset, format, \ ##__VA_ARGS__); \ + BT_INSN(_insn, ""); \ + } \ if (_insn->sym) \ _insn->sym->warned = 1; \ }) @@ -87,10 +89,15 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define BT_INSN(insn, format, ...) \ ({ \ if (opts.verbose || opts.backtrace) { \ - struct instruction *_insn = (insn); \ - char *_str = offstr(_insn->sec, _insn->offset); \ - WARN(" %s: " format, _str, ##__VA_ARGS__); \ - free(_str); \ + struct instruction *__insn = (insn); \ + char *_str = offstr(__insn->sec, __insn->offset); \ + const char *_istr = objtool_disas_insn(__insn); \ + int _len; \ + _len = snprintf(NULL, 0, " %s: " format, _str, ##__VA_ARGS__); \ + _len = (_len < 50) ? 
50 - _len : 0; \ + WARN(" %s: " format " %*s%s", _str, ##__VA_ARGS__, _len, "", _istr); \ + free(_str); \ + __insn->trace = 1; \ } \ }) diff --git a/tools/objtool/special.c b/tools/objtool/special.c index e262af917143..2a533afbc69a 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -81,6 +81,8 @@ static int get_alt_entry(struct elf *elf, const struct special_entry *entry, entry->orig_len); alt->new_len = *(unsigned char *)(sec->data->d_buf + offset + entry->new_len); + alt->feature = *(unsigned int *)(sec->data->d_buf + offset + + entry->feature); } orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); diff --git a/tools/objtool/trace.c b/tools/objtool/trace.c new file mode 100644 index 000000000000..5dec44dab781 --- /dev/null +++ b/tools/objtool/trace.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#include <objtool/trace.h> + +bool trace; +int trace_depth; + +/* + * Macros to trace CFI state attributes changes. + */ + +#define TRACE_CFI_ATTR(attr, prev, next, fmt, ...) \ +({ \ + if ((prev)->attr != (next)->attr) \ + TRACE("%s=" fmt " ", #attr, __VA_ARGS__); \ +}) + +#define TRACE_CFI_ATTR_BOOL(attr, prev, next) \ + TRACE_CFI_ATTR(attr, prev, next, \ + "%s", (next)->attr ? "true" : "false") + +#define TRACE_CFI_ATTR_NUM(attr, prev, next, fmt) \ + TRACE_CFI_ATTR(attr, prev, next, fmt, (next)->attr) + +#define CFI_REG_NAME_MAXLEN 16 + +/* + * Return the name of a register. Note that the same static buffer + * is returned if the name is dynamically generated. 
+ */ +static const char *cfi_reg_name(unsigned int reg) +{ + static char rname_buffer[CFI_REG_NAME_MAXLEN]; + const char *rname; + + switch (reg) { + case CFI_UNDEFINED: + return "<undefined>"; + case CFI_CFA: + return "cfa"; + case CFI_SP_INDIRECT: + return "(sp)"; + case CFI_BP_INDIRECT: + return "(bp)"; + } + + if (reg < CFI_NUM_REGS) { + rname = arch_reg_name[reg]; + if (rname) + return rname; + } + + if (snprintf(rname_buffer, CFI_REG_NAME_MAXLEN, "r%d", reg) == -1) + return "<error>"; + + return (const char *)rname_buffer; +} + +/* + * Functions and macros to trace CFI registers changes. + */ + +static void trace_cfi_reg(const char *prefix, int reg, const char *fmt, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + char *rname; + + if (base_prev == base_next && offset_prev == offset_next) + return; + + if (prefix) + TRACE("%s:", prefix); + + if (base_next == CFI_UNDEFINED) { + TRACE("%1$s=<undef> ", cfi_reg_name(reg)); + } else { + rname = strdup(cfi_reg_name(reg)); + TRACE(fmt, rname, cfi_reg_name(base_next), offset_next); + free(rname); + } +} + +static void trace_cfi_reg_val(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=%2$s%3$+d ", + base_prev, offset_prev, base_next, offset_next); +} + +static void trace_cfi_reg_ref(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=(%2$s%3$+d) ", + base_prev, offset_prev, base_next, offset_next); +} + +#define TRACE_CFI_REG_VAL(reg, prev, next) \ + trace_cfi_reg_val(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +#define TRACE_CFI_REG_REF(reg, prev, next) \ + trace_cfi_reg_ref(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext) +{ + struct cfi_state *cprev, *cnext; + int i; + + if (!memcmp(sprev, 
snext, sizeof(struct insn_state))) + return; + + cprev = &sprev->cfi; + cnext = &snext->cfi; + + disas_print_insn(stderr, objtool_disas_ctx, insn, + trace_depth - 1, "state: "); + + /* print registers changes */ + TRACE_CFI_REG_VAL(CFI_CFA, cprev->cfa, cnext->cfa); + for (i = 0; i < CFI_NUM_REGS; i++) { + TRACE_CFI_REG_VAL(i, cprev->vals[i], cnext->vals[i]); + TRACE_CFI_REG_REF(i, cprev->regs[i], cnext->regs[i]); + } + + /* print attributes changes */ + TRACE_CFI_ATTR_NUM(stack_size, cprev, cnext, "%d"); + TRACE_CFI_ATTR_BOOL(drap, cprev, cnext); + if (cnext->drap) { + trace_cfi_reg_val("drap", cnext->drap_reg, + cprev->drap_reg, cprev->drap_offset, + cnext->drap_reg, cnext->drap_offset); + } + TRACE_CFI_ATTR_BOOL(bp_scratch, cprev, cnext); + TRACE_CFI_ATTR_NUM(instr, sprev, snext, "%d"); + TRACE_CFI_ATTR_NUM(uaccess_stack, sprev, snext, "%u"); + + TRACE("\n"); + + insn->trace = 1; +} + +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + struct instruction *alt_insn; + char suffix[2]; + + alt_insn = alt->insn; + + if (alt->type == ALT_TYPE_EX_TABLE) { + /* + * When there is an exception table then the instruction + * at the original location is executed but it can cause + * an exception. In that case, the execution will be + * redirected to the alternative instruction. + * + * The instruction at the original location can have + * instruction alternatives, so we just print the location + * of the instruction that can cause the exception and + * not the instruction itself. 
+ */ + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s for instruction at 0x%lx <%s+0x%lx>", + alt_name, + orig_insn->offset, orig_insn->sym->name, + orig_insn->offset - orig_insn->sym->offset); + } else { + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s", alt_name); + } + + if (alt->type == ALT_TYPE_JUMP_TABLE) { + /* + * For a jump alternative, if the default instruction is + * a NOP then it is replaced with the jmp instruction, + * otherwise it is replaced with a NOP instruction. + */ + trace_depth++; + if (orig_insn->type == INSN_NOP) { + suffix[0] = (orig_insn->len == 5) ? 'q' : '\0'; + TRACE_ADDR(orig_insn, "jmp%-3s %lx <%s+0x%lx>", suffix, + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + } else { + TRACE_ADDR(orig_insn, "nop%d", orig_insn->len); + trace_depth--; + } + } +} + +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + if (alt->type == ALT_TYPE_JUMP_TABLE && orig_insn->type == INSN_NOP) + trace_depth--; + TRACE_ALT_INFO_NOADDR(orig_insn, "\\ ", "%s", alt_name); +} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5700516aa84a..2dd5f5a60568 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -354,9 +354,6 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_LDFLAGS-libaio = -lrt -FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl -FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl - CORE_CFLAGS += -fno-omit-frame-pointer CORE_CFLAGS += -Wall CORE_CFLAGS += -Wextra @@ -930,6 +927,8 @@ ifdef BUILD_NONDISTRO ifeq ($(feature-libbfd), 1) EXTLIBS += -lbfd -lopcodes + FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl + FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl else # we are on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here diff --git a/tools/perf/builtin-lock.c 
b/tools/perf/builtin-lock.c index 078634461df2..e8962c985d34 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1867,6 +1867,7 @@ static int __cmd_report(bool display_info) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.namespaces = perf_event__process_namespaces; eops.tracing_data = perf_event__process_tracing_data; session = perf_session__new(&data, &eops); @@ -2023,6 +2024,7 @@ static int __cmd_contention(int argc, const char **argv) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.tracing_data = perf_event__process_tracing_data; perf_env__init(&host_env); diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh index 7248a74ca2a3..6dd90519f45c 100755 --- a/tools/perf/tests/shell/lock_contention.sh +++ b/tools/perf/tests/shell/lock_contention.sh @@ -13,15 +13,18 @@ cleanup() { rm -f ${perfdata} rm -f ${result} rm -f ${errout} - trap - EXIT TERM INT + trap - EXIT TERM INT ERR } trap_cleanup() { + if (( $? 
== 139 )); then #SIGSEGV + err=1 + fi echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit ${err} } -trap trap_cleanup EXIT TERM INT +trap trap_cleanup EXIT TERM INT ERR check() { if [ "$(id -u)" != 0 ]; then @@ -145,7 +148,7 @@ test_aggr_cgroup() fi # the perf lock contention output goes to the stderr - perf lock con -a -b -g -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 @@ -271,7 +274,7 @@ test_cgroup_filter() return fi - perf lock con -a -b -g -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a cgroup result:" "$(cat "${result}")" err=1 @@ -279,7 +282,7 @@ test_cgroup_filter() fi cgroup=$(cat "${result}" | awk '{ print $3 }') - perf lock con -a -b -g -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a result with cgroup filter:" "$(cat "${cgroup}")" err=1 @@ -338,4 +341,5 @@ test_aggr_task_stack_filter test_cgroup_filter test_csv_output +cleanup exit ${err} diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4f2a6e10ed5c..4e12be579140 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1022,12 +1022,9 @@ static int write_bpf_prog_info(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.infos_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.infos_cnt, sizeof(env->bpf_progs.infos_cnt)); - if 
(ret < 0) + if (ret < 0 || env->bpf_progs.infos_cnt == 0) goto out; root = &env->bpf_progs.infos; @@ -1067,13 +1064,10 @@ static int write_bpf_btf(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.btfs_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.btfs_cnt, sizeof(env->bpf_progs.btfs_cnt)); - if (ret < 0) + if (ret < 0 || env->bpf_progs.btfs_cnt == 0) goto out; root = &env->bpf_progs.btfs; diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c index 01147fbf73b3..6434c2dccd4a 100644 --- a/tools/perf/util/libbfd.c +++ b/tools/perf/util/libbfd.c @@ -38,6 +38,39 @@ struct a2l_data { asymbol **syms; }; +static bool perf_bfd_lock(void *bfd_mutex) +{ + mutex_lock(bfd_mutex); + return true; +} + +static bool perf_bfd_unlock(void *bfd_mutex) +{ + mutex_unlock(bfd_mutex); + return true; +} + +static void perf_bfd_init(void) +{ + static struct mutex bfd_mutex; + + mutex_init_recursive(&bfd_mutex); + + if (bfd_init() != BFD_INIT_MAGIC) { + pr_err("Error initializing libbfd\n"); + return; + } + if (!bfd_thread_init(perf_bfd_lock, perf_bfd_unlock, &bfd_mutex)) + pr_err("Error initializing libbfd threading\n"); +} + +static void ensure_bfd_init(void) +{ + static pthread_once_t bfd_init_once = PTHREAD_ONCE_INIT; + + pthread_once(&bfd_init_once, perf_bfd_init); +} + static int bfd_error(const char *string) { const char *errmsg; @@ -132,6 +165,7 @@ static struct a2l_data *addr2line_init(const char *path) bfd *abfd; struct a2l_data *a2l = NULL; + ensure_bfd_init(); abfd = bfd_openr(path, NULL); if (abfd == NULL) return NULL; @@ -288,6 +322,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) bfd *abfd; u64 start, len; + ensure_bfd_init(); abfd = bfd_openr(debugfile, NULL); if (!abfd) return -1; @@ -393,6 +428,7 @@ int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block if (fd < 0) return -1; + ensure_bfd_init(); abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); if (!abfd) return -1; @@ -421,6 
+457,7 @@ int libbfd_filename__read_debuglink(const char *filename, char *debuglink, asection *section; bfd *abfd; + ensure_bfd_init(); abfd = bfd_openr(filename, NULL); if (!abfd) return -1; @@ -480,6 +517,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, memset(tpath, 0, sizeof(tpath)); perf_exe(tpath, sizeof(tpath)); + ensure_bfd_init(); bfdf = bfd_openr(tpath, NULL); if (bfdf == NULL) abort(); diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c index bca7f0717f35..7aa1f3f55a7d 100644 --- a/tools/perf/util/mutex.c +++ b/tools/perf/util/mutex.c @@ -17,7 +17,7 @@ static void check_err(const char *fn, int err) #define CHECK_ERR(err) check_err(__func__, err) -static void __mutex_init(struct mutex *mtx, bool pshared) +static void __mutex_init(struct mutex *mtx, bool pshared, bool recursive) { pthread_mutexattr_t attr; @@ -27,21 +27,27 @@ static void __mutex_init(struct mutex *mtx, bool pshared) /* In normal builds enable error checking, such as recursive usage. 
*/ CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK)); #endif + if (recursive) + CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)); if (pshared) CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); - CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr)); CHECK_ERR(pthread_mutexattr_destroy(&attr)); } void mutex_init(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/false); + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/false); } void mutex_init_pshared(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/true); + __mutex_init(mtx, /*pshared=*/true, /*recursive=*/false); +} + +void mutex_init_recursive(struct mutex *mtx) +{ + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/true); } void mutex_destroy(struct mutex *mtx) diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h index 38458f00846f..70232d8d094f 100644 --- a/tools/perf/util/mutex.h +++ b/tools/perf/util/mutex.h @@ -104,6 +104,8 @@ void mutex_init(struct mutex *mtx); * process-private attribute. */ void mutex_init_pshared(struct mutex *mtx); +/* Initializes a mutex that may be recursively held on the same thread. 
*/ +void mutex_init_recursive(struct mutex *mtx); void mutex_destroy(struct mutex *mtx); void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx); diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 44a9ecbd91e8..4d9b0177c312 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -222,6 +222,7 @@ int main(int argc, char *argv[]) fd_update_log = open("/dev/acpi_pfr_telemetry0", O_RDWR); if (fd_update_log < 0) { printf("PFRT device not supported - Quit...\n"); + close(fd_update); return 1; } @@ -265,7 +266,8 @@ int main(int argc, char *argv[]) printf("chunk2_size:%d\n", data_info.chunk2_size); printf("rollover_cnt:%d\n", data_info.rollover_cnt); printf("reset_cnt:%d\n", data_info.reset_cnt); - + close(fd_update); + close(fd_update_log); return 0; } @@ -358,6 +360,7 @@ int main(int argc, char *argv[]) if (ret == -1) { perror("Failed to load capsule file"); + munmap(addr_map_capsule, st.st_size); close(fd_capsule); close(fd_update); close(fd_update_log); @@ -420,7 +423,7 @@ int main(int argc, char *argv[]) if (p_mmap == MAP_FAILED) { perror("mmap error."); close(fd_update_log); - + free(log_buf); return 1; } diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c43db1c41205..a1df9196dc45 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -37,9 +37,7 @@ NLS ?= true # cpufreq-bench benchmarking tool CPUFREQ_BENCH ?= true -# Do not build libraries, but build the code in statically -# Libraries are still built, otherwise the Makefile code would -# be rather ugly. +# Build the code, including libraries, statically. 
export STATIC ?= false # Prefix to the directories we're installing to @@ -207,14 +205,25 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) +LIBCPUPOWER := libcpupower.a +else +LIBCPUPOWER := libcpupower.so.$(LIB_VER) +endif + +$(OUTPUT)$(LIBCPUPOWER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) + $(ECHO) " AR " $@ + $(QUIET) $(AR) rcs $@ $(LIB_OBJS) +else $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +endif -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) +libcpupower: $(OUTPUT)$(LIBCPUPOWER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -224,7 +233,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)$(LIBCPUPOWER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -269,7 +278,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) +compile-bench: $(OUTPUT)$(LIBCPUPOWER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. 
if the target directory is not the @@ -287,6 +296,7 @@ clean: -find $(OUTPUT) \( -not -type d \) -and \( -name '*~' -o -name '*.[oas]' \) -type f -print \ | xargs rm -f -rm -f $(OUTPUT)cpupower + -rm -f $(OUTPUT)libcpupower.a -rm -f $(OUTPUT)libcpupower.so* -rm -rf $(OUTPUT)po/*.gmo -rm -rf $(OUTPUT)po/*.pot @@ -295,7 +305,11 @@ clean: install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} +ifeq ($(strip $(STATIC)),true) + $(CP) $(OUTPUT)libcpupower.a $(DESTDIR)${libdir}/ +else $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ +endif $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h @@ -336,11 +350,7 @@ install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install -ifeq ($(strip $(STATIC)),true) -install: all install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -else install: all install-lib install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -endif uninstall: - rm -f $(DESTDIR)${libdir}/libcpupower.* diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index a85c19e9524e..0114108ab25f 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config) static bool sve_write_fpsimd_supported(struct test_config *config) { - if (!sve_supported()) + if (!sve_supported() && !sme_supported()) return false; if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA)) @@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) vl = vl_expected(config); vq = __sve_vq_from_vl(vl); - if (!vl) - return; - iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { 
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index e0fc3a001e28..f44d44618575 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -394,6 +394,58 @@ out: free(svebuf); } +/* Write the FPSIMD registers via the SVE regset when SVE is not supported */ +static void ptrace_sve_fpsimd_no_sve(pid_t child) +{ + void *svebuf; + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd, new_fpsimd; + unsigned int i, j; + unsigned char *p; + int ret; + + svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + if (!svebuf) { + ksft_test_result_fail("Failed to allocate FPSIMD buffer\n"); + return; + } + + /* On a system without SVE the VL should be set to 0 */ + memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + sve = svebuf; + sve->flags = SVE_PT_REGS_FPSIMD; + sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD); + sve->vl = 0; + + /* Try to set a known FPSIMD state via PT_REGS_SVE */ + fpsimd = (struct user_fpsimd_state *)((char *)sve + + SVE_PT_FPSIMD_OFFSET); + for (i = 0; i < 32; ++i) { + p = (unsigned char *)&fpsimd->vregs[i]; + + for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j) + p[j] = j; + } + + ret = set_sve(child, &vec_types[0], sve); + ksft_test_result(ret == 0, "FPSIMD write via SVE\n"); + if (ret) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + + /* Verify via the FPSIMD regset */ + if (get_fpsimd(child, &new_fpsimd)) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0, + "Verify FPSIMD write via SVE\n"); + +out: + free(svebuf); +} + /* Validate attempting to set SVE data and read SVE data */ static void ptrace_set_sve_get_sve_data(pid_t child, const struct vec_type *type, @@ -826,6 +878,15 @@ static int do_parent(pid_t child) } } + /* We support SVE writes of FPSMID format on SME only systems */ + if 
(!(getauxval(AT_HWCAP) & HWCAP_SVE) && + (getauxval(AT_HWCAP2) & HWCAP2_SME)) { + ptrace_sve_fpsimd_no_sve(child); + } else { + ksft_test_result_skip("FPSIMD write via SVE\n"); + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + } + ret = EXIT_SUCCESS; error: diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index 38080f3c3280..a8df05771670 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -276,7 +276,7 @@ function barf bl putdec puts ", iteration=" mov x0, x22 - bl putdec + bl putdecn puts "\tExpected [" mov x0, x10 mov x1, x12 diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653e..f2a2fd236ca8 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y +CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y @@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y CONFIG_NF_NAT=y CONFIG_PACKET=y CONFIG_RC_CORE=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_LIVEPATCH=m CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SYN_COOKIES=y diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c new file mode 100644 index 000000000000..72aa5376c30e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "testing_helpers.h" +#include "livepatch_trampoline.skel.h" + +static int load_livepatch(void) +{ + char path[4096]; + + /* CI will set KBUILD_OUTPUT */ + snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko", + getenv("KBUILD_OUTPUT") ? 
: "../../../.."); + + return load_module(path, env_verbosity > VERBOSE_NONE); +} + +static void unload_livepatch(void) +{ + /* Disable the livepatch before unloading the module */ + system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled"); + + unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE); +} + +static void read_proc_cmdline(void) +{ + char buf[4096]; + int fd, ret; + + fd = open("/proc/cmdline", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open /proc/cmdline")) + return; + + ret = read(fd, buf, sizeof(buf)); + if (!ASSERT_GT(ret, 0, "read /proc/cmdline")) + goto out; + + ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp"); + +out: + close(fd); +} + +static void __test_livepatch_trampoline(bool fexit_first) +{ + struct livepatch_trampoline *skel = NULL; + int err; + + skel = livepatch_trampoline__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto out; + + skel->bss->my_pid = getpid(); + + if (!fexit_first) { + /* fentry program is loaded first by default */ + err = livepatch_trampoline__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + } else { + /* Manually load fexit program first. */ + skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline); + if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit")) + goto out; + + skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline); + if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry")) + goto out; + } + + read_proc_cmdline(); + + ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit"); + ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit"); +out: + livepatch_trampoline__destroy(skel); +} + +void test_livepatch_trampoline(void) +{ + int retry_cnt = 0; + +retry: + if (load_livepatch()) { + if (retry_cnt) { + ASSERT_OK(1, "load_livepatch"); + goto out; + } + /* + * Something else (previous run of the same test?) loaded + * the KLP module. Unload the KLP module and retry. 
+ */ + unload_livepatch(); + retry_cnt++; + goto retry; + } + + if (test__start_subtest("fentry_first")) + __test_livepatch_trampoline(false); + + if (test__start_subtest("fexit_first")) + __test_livepatch_trampoline(true); +out: + unload_livepatch(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index f8eb7f9d4fd2..8fade8bdc451 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -6,11 +6,13 @@ #include <netinet/in.h> #include <test_progs.h> #include <unistd.h> +#include <errno.h> #include "cgroup_helpers.h" #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" #include "mptcp_subflow.skel.h" +#include "mptcp_sockmap.skel.h" #define NS_TEST "mptcp_ns" #define ADDR_1 "10.0.1.1" @@ -436,6 +438,142 @@ close_cgroup: close(cgroup_fd); } +/* Test sockmap on MPTCP server handling non-mp-capable clients. */ +static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, client_fd1 = -1, client_fd2 = -1; + int server_fd1 = -1, server_fd2 = -1, sent, recvd; + char snd[9] = "123456789"; + char rcv[10]; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client without MPTCP enabled */ + client_fd1 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd1 = accept(listen_fd, NULL, 0); + skel->bss->sk_index = 1; + client_fd2 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd2 = accept(listen_fd, NULL, 0); + /* test normal redirect behavior: data sent by client_fd1 can be + * received by client_fd2 + */ + 
skel->bss->redirect_idx = 1; + sent = send(client_fd1, snd, sizeof(snd), 0); + if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)")) + goto end; + + /* try to recv more bytes to avoid truncation check */ + recvd = recv(client_fd2, rcv, sizeof(rcv), 0); + if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)")) + goto end; + +end: + if (client_fd1 >= 0) + close(client_fd1); + if (client_fd2 >= 0) + close(client_fd2); + if (server_fd1 >= 0) + close(server_fd1); + if (server_fd2 >= 0) + close(server_fd2); + close(listen_fd); +} + +/* Test sockmap rejection of MPTCP sockets - both server and client sides. */ +static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, server_fd = -1, client_fd1 = -1; + int err, zero = 0; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client with MPTCP enabled */ + client_fd1 = connect_to_fd(listen_fd, 0); + if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1")) + goto end; + + /* bpf_sock_map_update() called from sockops should reject MPTCP sk */ + if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject")) + goto end; + + server_fd = accept(listen_fd, NULL, 0); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &server_fd, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed")) + goto end; + + /* MPTCP client should also be disallowed */ + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &client_fd1, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed")) + goto end; +end: + if (client_fd1 >= 0) + close(client_fd1); + if (server_fd >= 0) + close(server_fd); + close(listen_fd); +} + +static void test_mptcp_sockmap(void) +{ + struct mptcp_sockmap *skel; + struct 
netns_obj *netns; + int cgroup_fd, err; + + cgroup_fd = test__join_cgroup("/mptcp_sockmap"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap")) + return; + + skel = mptcp_sockmap__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap")) + goto close_cgroup; + + skel->links.mptcp_sockmap_inject = + bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap")) + goto skel_destroy; + + err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect), + bpf_map__fd(skel->maps.sock_map), + BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto skel_destroy; + + netns = netns_new(NS_TEST, true); + if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + test_sockmap_with_mptcp_fallback(skel); + test_sockmap_reject_mptcp(skel); + +close_netns: + netns_free(netns); +skel_destroy: + mptcp_sockmap__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) @@ -444,4 +582,6 @@ void test_mptcp(void) test_mptcpify(); if (test__start_subtest("subflow")) test_subflow(); + if (test__start_subtest("sockmap")) + test_mptcp_sockmap(); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c new file mode 100644 index 000000000000..c9efdd2a5b18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "stacktrace_ips.skel.h" + +#ifdef __x86_64__ +static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...) 
+{ + __u64 ips[PERF_MAX_STACK_DEPTH]; + struct ksyms *ksyms = NULL; + int i, err = 0; + va_list args; + + /* sorted by addr */ + ksyms = load_kallsyms_local(); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local")) + return -1; + + /* unlikely, but... */ + if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max")) + return -1; + + err = bpf_map_lookup_elem(fd, &key, ips); + if (err) + goto out; + + /* + * Compare all symbols provided via arguments with stacktrace ips, + * and their related symbol addresses.t + */ + va_start(args, cnt); + + for (i = 0; i < cnt; i++) { + unsigned long val; + struct ksym *ksym; + + val = va_arg(args, unsigned long); + ksym = ksym_search_local(ksyms, ips[i]); + if (!ASSERT_OK_PTR(ksym, "ksym_search_local")) + break; + ASSERT_EQ(ksym->addr, val, "stack_cmp"); + } + + va_end(args); + +out: + free_kallsyms_local(ksyms); + return err; +} + +static void test_stacktrace_ips_kprobe_multi(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts( + skel->progs.kprobe_multi_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void test_stacktrace_ips_raw_tp(void) +{ + __u32 info_len = sizeof(struct 
bpf_prog_info); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_prog_info info = {}; + struct stacktrace_ips *skel; + __u64 bpf_prog_ksym = 0; + int err; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.rawtp_test = bpf_program__attach_raw_tracepoint( + skel->progs.rawtp_test, + "bpf_testmod_test_read"); + if (!ASSERT_OK_PTR(skel->links.rawtp_test, "bpf_program__attach_raw_tracepoint")) + goto cleanup; + + /* get bpf program address */ + info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym); + info.nr_jited_ksyms = 1; + err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test), + &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2, + bpf_prog_ksym, + ksym_get_addr("bpf_trace_run2")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void __test_stacktrace_ips(void) +{ + if (test__start_subtest("kprobe_multi")) + test_stacktrace_ips_kprobe_multi(false); + if (test__start_subtest("kretprobe_multi")) + test_stacktrace_ips_kprobe_multi(true); + if (test__start_subtest("raw_tp")) + test_stacktrace_ips_raw_tp(); +} +#else +static void __test_stacktrace_ips(void) +{ + test__skip(); +} +#endif + +void test_stacktrace_ips(void) +{ + __test_stacktrace_ips(); +} diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c index 05fa5ce7fc59..d00fd570255a 100644 --- a/tools/testing/selftests/bpf/progs/iters_looping.c +++ b/tools/testing/selftests/bpf/progs/iters_looping.c @@ -161,3 +161,56 @@ int simplest_loop(void *ctx) return 0; } + +__used +static void iterator_with_diff_stack_depth(int x) +{ + struct bpf_iter_num iter; + + asm volatile ( + "if r1 == 42 goto 0f;" + "*(u64 
*)(r10 - 128) = 0;" + "0:" + /* create iterator */ + "r1 = %[iter];" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "1:" + /* consume next item */ + "r1 = %[iter];" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto 2f;" + "goto 1b;" + "2:" + /* destroy iterator */ + "r1 = %[iter];" + "call %[bpf_iter_num_destroy];" + : + : __imm_ptr(iter), ITER_HELPERS + : __clobber_common, "r6" + ); +} + +SEC("socket") +__success +__naked int widening_stack_size_bug(void *ctx) +{ + /* + * Depending on iterator_with_diff_stack_depth() parameter value, + * subprogram stack depth is either 8 or 128 bytes. Arrange values so + * that it is 128 on a first call and 8 on a second. This triggered a + * bug in verifier's widen_imprecise_scalars() logic. + */ + asm volatile ( + "r6 = 0;" + "r1 = 0;" + "1:" + "call iterator_with_diff_stack_depth;" + "r1 = 42;" + "r6 += 1;" + "if r6 < 2 goto 1b;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c new file mode 100644 index 000000000000..15579d5bcd91 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int fentry_hit; +int fexit_hit; +int my_pid; + +SEC("fentry/cmdline_proc_show") +int BPF_PROG(fentry_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fentry_hit = 1; + return 0; +} + +SEC("fexit/cmdline_proc_show") +int BPF_PROG(fexit_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fexit_hit = 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c new file mode 100644 index 000000000000..d4eef0cbadb9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bpf_tracing_net.h" + +char _license[] SEC("license") = "GPL"; + +int sk_index; +int redirect_idx; +int trace_port; +int helper_ret; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 100); +} sock_map SEC(".maps"); + +SEC("sockops") +int mptcp_sockmap_inject(struct bpf_sock_ops *skops) +{ + struct bpf_sock *sk; + + /* only accept specified connection */ + if (skops->local_port != trace_port || + skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) + return 1; + + sk = skops->sk; + if (!sk) + return 1; + + /* update sk handler */ + helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST); + + return 1; +} + +SEC("sk_skb/stream_verdict") +int mptcp_sockmap_redirect(struct __sk_buff *skb) +{ + /* redirect skb to the sk under sock_map[redirect_idx] */ + return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0); +} diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c new file mode 100644 index 000000000000..a96c8150d7f5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 
+// Copyright (c) 2018 Facebook + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH]; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(max_entries, 16384); + __type(key, __u32); + __type(value, stack_trace_t); +} stackmap SEC(".maps"); + +extern bool CONFIG_UNWINDER_ORC __kconfig __weak; + +/* + * This function is here to have CONFIG_UNWINDER_ORC + * used and added to object BTF. + */ +int unused(void) +{ + return CONFIG_UNWINDER_ORC ? 0 : 1; +} + +__u32 stack_key; + +SEC("kprobe.multi") +int kprobe_multi_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +SEC("raw_tp/bpf_testmod_test_read") +int rawtp_test(void *ctx) +{ + /* Skip ebpf program entry in the stack. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index b4a0d0cc8ec8..3662515f0107 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, 
NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 23217f06a3ec..663a80990f8f 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args) if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); return 0; } @@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 77fe8f28facd..1270953fd092 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); return 0; } @@ -76,7 +76,7 @@ 
int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 90fca06fff56..55e555f7f41b 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8074bc5f6f20..ed0a4721d8fd 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, return a + (long)b + c + d + (long)e + f + g + h + i + j + k; } +noinline void bpf_testmod_stacktrace_test(void) +{ + /* used for stacktrace test as attach function */ + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_3(void) +{ + bpf_testmod_stacktrace_test(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_2(void) +{ + bpf_testmod_stacktrace_test_3(); + asm volatile (""); +} + +noinline void 
bpf_testmod_stacktrace_test_1(void) +{ + bpf_testmod_stacktrace_test_2(); + asm volatile (""); +} + int bpf_testmod_fentry_ok; noinline ssize_t @@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, 21, 22, 23, 24, 25, 26) != 231) goto out; + bpf_testmod_stacktrace_test_1(); + bpf_testmod_fentry_ok = 1; out: return -EIO; /* always fail */ diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore new file mode 100644 index 000000000000..097f52db0be9 --- /dev/null +++ b/tools/testing/selftests/coredump/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +stackdump_test +coredump_socket_test +coredump_socket_protocol_test diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index 77b3665c73c7..dece1a31d561 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,7 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := stackdump_test +TEST_GEN_PROGS := stackdump_test \ + coredump_socket_test \ + coredump_socket_protocol_test TEST_FILES := stackdump include ../lib.mk + +$(OUTPUT)/stackdump_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c new file mode 100644 index 000000000000..d19b6717c53e --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +#define NUM_CRASHING_COREDUMPS 5 + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = 
-ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket_request_kernel) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_core_file 
= -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) 
{ + fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_kernel: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); + system("file /tmp/coredump.file"); +} + +TEST_F(coredump, socket_request_userspace) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + 
ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + 
COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_userspace: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_reject) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = 
fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_reject: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_reject: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_reject: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, 
"socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_reject: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_flag_combination) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = 
EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) { + 
fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_unknown_flag) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n"); + 
goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) { + fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) { + fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_unknown_flag: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + 
EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_small) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n"); + goto out; + 
} + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 / 2)) { + fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) { + fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + 
wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_large) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_large: 
read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGSEGV. 
+ */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set 
in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGABRT. 
+ */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set 
in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + 
ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + struct coredump_req req = {}; + + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "Failed to create and listen on unix socket\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "Failed to notify parent via ipc socket\n"); + goto out; + } + close(ipc_sockets[1]); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); + goto out; + } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { 
+ fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); + goto out; + } + } + + close(fd_core_file); + close(fd_peer_pidfd); + close(fd_coredump); + fd_peer_pidfd = -1; + fd_coredump = -1; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + waitpid(pid[i], &status[i], 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, 
&info), 0);
+		ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+		ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	}
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: several crashing children served by per-connection worker processes
+ *
+ * The coredump server accepts one connection per crashing child, performs
+ * the request/ack handshake itself, then forks a worker and passes it the
+ * connection, the peer pidfd and a core file. All workers are reaped before
+ * the server exits; a worker that exits with a non-zero code fails the server.
+ */
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
+{
+	int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+	pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		/* n_conns counts successfully forked workers, i.e. valid worker_pids[] slots. */
+		int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
+
+		close(ipc_sockets[0]);
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening before it starts crashing children. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+		close(ipc_sockets[1]);
+
+		while (n_conns < NUM_CRASHING_COREDUMPS) {
+			int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+			struct coredump_req req = {};
+			fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+			if (fd_coredump < 0) {
+				if (errno == EAGAIN || errno == EWOULDBLOCK)
+					continue;
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n");
+				goto out;
+			}
+			fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+			if (fd_peer_pidfd < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n");
+				goto out;
+			}
+			if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n");
+				goto out;
+			}
+			if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n");
+				goto out;
+			}
+			if (!read_coredump_req(fd_coredump, &req)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n");
+				goto out;
+			}
+			if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+						COREDUMP_KERNEL | COREDUMP_USERSPACE |
+						COREDUMP_REJECT | COREDUMP_WAIT)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n");
+				goto out;
+			}
+			if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n");
+				goto out;
+			}
+			if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n");
+				goto out;
+			}
+			fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+			if (fd_core_file < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n");
+				goto out;
+			}
+			pid_t worker = fork();
+			if (worker < 0) {
+				/*
+				 * Never record a failed fork: waitpid(-1, ...) in the
+				 * reap loop below would wait for an arbitrary child
+				 * instead of a worker of ours.
+				 */
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: fork failed: %m\n");
+				close(fd_core_file);
+				close(fd_peer_pidfd);
+				close(fd_coredump);
+				goto out;
+			}
+			if (worker == 0) {
+				close(fd_server);
+				/* NOTE(review): presumably does not return (defined outside this file) - confirm. */
+				process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
+			}
+			worker_pids[n_conns] = worker;
+			/* The worker owns its own copies now; drop the server's. */
+			if (fd_coredump >= 0)
+				close(fd_coredump);
+			if (fd_peer_pidfd >= 0)
+				close(fd_peer_pidfd);
+			if (fd_core_file >= 0)
+				close(fd_core_file);
+			n_conns++;
+		}
+		exit_code = EXIT_SUCCESS;
+out:
+		if (fd_server >= 0)
+			close(fd_server);
+
+		// Reap all worker processes
+		for (int i = 0; i < n_conns; i++) {
+			int wstatus;
+			if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
+				fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
+			} else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) !=
EXIT_SUCCESS) { + fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); + exit_code = EXIT_FAILURE; + } + } + + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c new file mode 100644 index 000000000000..7e26d4a6a15d --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_test.c @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + 
self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket test: write_nointr to ipc socket failed: 
%m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket test: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket test: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket test: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket test: creat coredump file failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket test: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket test: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = 
fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); +} + +TEST_F(coredump, socket_detect_userspace_client) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = { + .mask = PIDFD_INFO_COREDUMP, + }; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_detect_userspace_client: 
get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (info.coredump_mask & PIDFD_COREDUMPED) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_detect_userspace_client: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + int fd_socket; + ssize_t ret; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = + offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd_socket < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n"); + _exit(EXIT_FAILURE); + } + + ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n"); + _exit(EXIT_FAILURE); + } + + close(fd_socket); + pause(); + fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n"); + _exit(EXIT_SUCCESS); + } + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + 
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); + ASSERT_EQ(close(pidfd), 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + ASSERT_NE(stat("/tmp/coredump.file", &st), 0); + ASSERT_EQ(errno, ENOENT); +} + +TEST_F(coredump, socket_enoent) +{ + int pidfd, status; + pid_t pid; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); +} + +TEST_F(coredump, socket_no_listener) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + int ipc_sockets[2]; + char c; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd_server < 0) { + fprintf(stderr, "socket_no_listener: socket failed: %m\n"); + goto out; + } + + ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_no_listener: bind failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_no_listener: completed successfully\n"); +out: + if (fd_server >= 0) + close(fd_server); + 
close(ipc_sockets[1]); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, 
"socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + 
_exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, 
NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, 
"socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_invalid_paths) +{ + ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@..")); + + ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@@..")); + + ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h new file mode 100644 index 000000000000..ed47f01fa53c --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
+ +#ifndef __COREDUMP_TEST_H +#define __COREDUMP_TEST_H + +#include <stdbool.h> +#include <sys/types.h> +#include <linux/coredump.h> + +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +/* Coredump fixture */ +FIXTURE(coredump) +{ + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +/* Shared helper function declarations */ +void *do_nothing(void *arg); +void crashing_child(void); +int create_detached_tmpfs(void); +int create_and_listen_unix_socket(const char *path); +bool set_core_pattern(const char *pattern); +int get_peer_pidfd(int fd); +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info); + +/* Inline helper that uses harness types */ +static inline void wait_and_check_coredump_server(pid_t pid_coredump_server, + struct __test_metadata *const _metadata, + FIXTURE_DATA(coredump) *self) +{ + int status; + waitpid(pid_coredump_server, &status, 0); + self->pid_coredump_server = -ESRCH; + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +/* Protocol helper function declarations */ +ssize_t recv_marker(int fd); +bool read_marker(int fd, enum coredump_mark mark); +bool read_coredump_req(int fd, struct coredump_req *req); +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack); +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask); +int open_coredump_tmpfile(int fd_tmpfs_detached); +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file); + +#endif /* __COREDUMP_TEST_H */ diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c new file mode 100644 index 000000000000..a6f6d5f2ae07 --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + 
+#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../filesystems/wrappers.h" +#include "../pidfd/pidfd.h" + +/* Forward declarations to avoid including harness header */ +struct __test_metadata; + +/* Match the fixture definition from coredump_test.h */ +struct _fixture_coredump_data { + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +void *do_nothing(void *arg) +{ + (void)arg; + while (1) + pause(); + + return NULL; +} + +void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +int create_detached_tmpfs(void) +{ + int fd_context, fd_tmpfs; + + fd_context = sys_fsopen("tmpfs", 0); + if (fd_context < 0) + return -1; + + if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -1; + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + close(fd_context); + return fd_tmpfs; +} + +int create_and_listen_unix_socket(const char *path) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + assert(strlen(path) < sizeof(addr.sun_path) - 1); + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + size_t addr_len = + offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; + int fd, ret; + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) + goto out; + + ret = bind(fd, (const struct sockaddr *)&addr, addr_len); + if (ret < 0) + goto out; + + ret = listen(fd, 128); + if (ret < 0) + goto out; + + 
return fd; + +out: + if (fd >= 0) + close(fd); + return -1; +} + +bool set_core_pattern(const char *pattern) +{ + int fd; + ssize_t ret; + + fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); + if (fd < 0) + return false; + + ret = write(fd, pattern, strlen(pattern)); + close(fd); + if (ret < 0) + return false; + + fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); + return ret == strlen(pattern); +} + +int get_peer_pidfd(int fd) +{ + int fd_peer_pidfd; + socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); + int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, + &fd_peer_pidfd_len); + if (ret < 0) { + fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n"); + return -1; + } + fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", fd_peer_pidfd); + return fd_peer_pidfd; +} + +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) +{ + int ret; + memset(info, 0, sizeof(*info)); + info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info); + if (ret < 0) { + fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n"); + return false; + } + fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n", + (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal); + return true; +} + +/* Protocol helper functions */ + +ssize_t recv_marker(int fd) +{ + enum coredump_mark mark = COREDUMP_MARK_REQACK; + ssize_t ret; + + ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); + if (ret != sizeof(mark)) + return -1; + + switch (mark) { + case COREDUMP_MARK_REQACK: + fprintf(stderr, "Received marker: ReqAck\n"); + return COREDUMP_MARK_REQACK; + case COREDUMP_MARK_MINSIZE: + fprintf(stderr, "Received marker: MinSize\n"); + return COREDUMP_MARK_MINSIZE; + case COREDUMP_MARK_MAXSIZE: + fprintf(stderr, "Received marker: MaxSize\n"); + return 
COREDUMP_MARK_MAXSIZE; + case COREDUMP_MARK_UNSUPPORTED: + fprintf(stderr, "Received marker: Unsupported\n"); + return COREDUMP_MARK_UNSUPPORTED; + case COREDUMP_MARK_CONFLICTING: + fprintf(stderr, "Received marker: Conflicting\n"); + return COREDUMP_MARK_CONFLICTING; + default: + fprintf(stderr, "Received unknown marker: %u\n", mark); + break; + } + return -1; +} + +bool read_marker(int fd, enum coredump_mark mark) +{ + ssize_t ret; + + ret = recv_marker(fd); + if (ret < 0) + return false; + return ret == mark; +} + +bool read_coredump_req(int fd, struct coredump_req *req) +{ + ssize_t ret; + size_t field_size, user_size, ack_size, kernel_size, remaining_size; + + memset(req, 0, sizeof(*req)); + field_size = sizeof(req->size); + + /* Peek the size of the coredump request. */ + ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); + if (ret != field_size) { + fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n", + ret, field_size); + return false; + } + kernel_size = req->size; + + if (kernel_size < COREDUMP_ACK_SIZE_VER0) { + fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n", + kernel_size, COREDUMP_ACK_SIZE_VER0); + return false; + } + if (kernel_size >= PAGE_SIZE) { + fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n", + kernel_size, PAGE_SIZE); + return false; + } + + /* Use the minimum of user and kernel size to read the full request. */ + user_size = sizeof(struct coredump_req); + ack_size = user_size < kernel_size ? 
user_size : kernel_size; + ret = recv(fd, req, ack_size, MSG_WAITALL); + if (ret != ack_size) + return false; + + fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", + req->size, (unsigned long long)req->mask); + + if (user_size > kernel_size) + remaining_size = user_size - kernel_size; + else + remaining_size = kernel_size - user_size; + + if (PAGE_SIZE <= remaining_size) + return false; + + /* + * Discard any additional data if the kernel's request was larger than + * what we knew about or cared about. + */ + if (remaining_size) { + char buffer[PAGE_SIZE]; + + ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); + if (ret != remaining_size) + return false; + fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); + } + + return true; +} + +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack) +{ + ssize_t ret; + /* + * Wrap struct coredump_ack in a larger struct so we can + * simulate sending to much data to the kernel. + */ + struct large_ack_for_size_testing { + struct coredump_ack ack; + char buffer[PAGE_SIZE]; + } large_ack = {}; + + if (!size_ack) + size_ack = sizeof(struct coredump_ack) < req->size_ack ? 
+ sizeof(struct coredump_ack) : + req->size_ack; + large_ack.ack.mask = mask; + large_ack.ack.size = size_ack; + ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); + if (ret != size_ack) + return false; + + fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", + size_ack, (unsigned long long)mask); + return true; +} + +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask) +{ + if (req->size < min_size) + return false; + if ((req->mask & required_mask) != required_mask) + return false; + if (req->mask & ~required_mask) + return false; + return true; +} + +int open_coredump_tmpfile(int fd_tmpfs_detached) +{ + return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); +} + +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) +{ + int epfd = -1; + int exit_code = EXIT_FAILURE; + struct epoll_event ev; + int flags; + + /* Set socket to non-blocking mode for edge-triggered epoll */ + flags = fcntl(fd_coredump, F_GETFL, 0); + if (flags < 0) { + fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n"); + goto out; + } + if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) { + fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n"); + goto out; + } + + epfd = epoll_create1(0); + if (epfd < 0) { + fprintf(stderr, "Worker: epoll_create1() failed: %m\n"); + goto out; + } + + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + ev.data.fd = fd_coredump; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) { + fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n"); + goto out; + } + + for (;;) { + struct epoll_event events[1]; + int n = epoll_wait(epfd, events, 1, -1); + if (n < 0) { + fprintf(stderr, "Worker: epoll_wait() failed: %m\n"); + break; + } + + if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { + for (;;) { + char buffer[4096]; + ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + if (errno == EAGAIN || 
errno == EWOULDBLOCK) + break; + fprintf(stderr, "Worker: read() failed: %m\n"); + goto out; + } + if (bytes_read == 0) + goto done; + ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_write != bytes_read) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + } + } + +done: + exit_code = EXIT_SUCCESS; + fprintf(stderr, "Worker: completed successfully\n"); +out: + if (epfd >= 0) + close(epfd); + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + _exit(exit_code); +} diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index a4ac80bb1003..c2e895bcc160 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -23,57 +23,15 @@ #include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" +#include "coredump_test.h" + #define STACKDUMP_FILE "stack_values" #define STACKDUMP_SCRIPT "stackdump" -#define NUM_THREAD_SPAWN 128 #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif -static void *do_nothing(void *) -{ - while (1) - pause(); - - return NULL; -} - -static void crashing_child(void) -{ - pthread_t thread; - int i; - - for (i = 0; i < NUM_THREAD_SPAWN; ++i) - pthread_create(&thread, NULL, do_nothing, NULL); - - /* crash on purpose */ - i = *(int *)NULL; -} - -FIXTURE(coredump) -{ - char original_core_pattern[256]; - pid_t pid_coredump_server; - int fd_tmpfs_detached; -}; - -static int create_detached_tmpfs(void) -{ - int fd_context, fd_tmpfs; - - fd_context = sys_fsopen("tmpfs", 0); - if (fd_context < 0) - return -1; - - if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) - return -1; - - fd_tmpfs = sys_fsmount(fd_context, 0, 0); - close(fd_context); - return fd_tmpfs; -} - 
FIXTURE_SETUP(coredump) { FILE *file; @@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) fclose(file); } -static int create_and_listen_unix_socket(const char *path) -{ - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - assert(strlen(path) < sizeof(addr.sun_path) - 1); - strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); - size_t addr_len = - offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; - int fd, ret; - - fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd < 0) - goto out; - - ret = bind(fd, (const struct sockaddr *)&addr, addr_len); - if (ret < 0) - goto out; - - ret = listen(fd, 128); - if (ret < 0) - goto out; - - return fd; - -out: - if (fd >= 0) - close(fd); - return -1; -} - -static bool set_core_pattern(const char *pattern) -{ - int fd; - ssize_t ret; - - fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); - if (fd < 0) - return false; - - ret = write(fd, pattern, strlen(pattern)); - close(fd); - if (ret < 0) - return false; - - fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); - return ret == strlen(pattern); -} - -static int get_peer_pidfd(int fd) -{ - int fd_peer_pidfd; - socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, - &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - return -1; - } - return fd_peer_pidfd; -} - -static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) -{ - memset(info, 0, sizeof(*info)); - info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0; -} - -static void -wait_and_check_coredump_server(pid_t pid_coredump_server, - struct __test_metadata *const _metadata, - FIXTURE_DATA(coredump)* self) -{ - int status; - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - 
ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); -} - -TEST_F(coredump, socket) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - 
EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_detect_userspace_client) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (info.coredump_mask & PIDFD_COREDUMPED) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - 
EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) { - int fd_socket; - ssize_t ret; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = - offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd_socket < 0) - _exit(EXIT_FAILURE); - - ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - _exit(EXIT_FAILURE); - - close(fd_socket); - _exit(EXIT_SUCCESS); - } - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_NE(stat("/tmp/coredump.file", &st), 0); - ASSERT_EQ(errno, ENOENT); -} - -TEST_F(coredump, socket_enoent) -{ - int pidfd, status; - pid_t pid; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); -} - -TEST_F(coredump, socket_no_listener) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - int ipc_sockets[2]; - char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | 
SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd_server < 0) - goto out; - - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - close(ipc_sockets[1]); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static ssize_t recv_marker(int fd) -{ - enum coredump_mark mark = COREDUMP_MARK_REQACK; - ssize_t ret; - - ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); - if (ret != sizeof(mark)) - return -1; - - switch (mark) { - case COREDUMP_MARK_REQACK: - fprintf(stderr, "Received marker: ReqAck\n"); - return COREDUMP_MARK_REQACK; - case COREDUMP_MARK_MINSIZE: - fprintf(stderr, "Received marker: MinSize\n"); - return COREDUMP_MARK_MINSIZE; - case COREDUMP_MARK_MAXSIZE: - fprintf(stderr, "Received marker: MaxSize\n"); - return COREDUMP_MARK_MAXSIZE; - case COREDUMP_MARK_UNSUPPORTED: - fprintf(stderr, "Received marker: Unsupported\n"); - return COREDUMP_MARK_UNSUPPORTED; - case COREDUMP_MARK_CONFLICTING: - fprintf(stderr, "Received marker: Conflicting\n"); - return COREDUMP_MARK_CONFLICTING; - default: - fprintf(stderr, "Received unknown marker: %u\n", mark); - 
break; - } - return -1; -} - -static bool read_marker(int fd, enum coredump_mark mark) -{ - ssize_t ret; - - ret = recv_marker(fd); - if (ret < 0) - return false; - return ret == mark; -} - -static bool read_coredump_req(int fd, struct coredump_req *req) -{ - ssize_t ret; - size_t field_size, user_size, ack_size, kernel_size, remaining_size; - - memset(req, 0, sizeof(*req)); - field_size = sizeof(req->size); - - /* Peek the size of the coredump request. */ - ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); - if (ret != field_size) - return false; - kernel_size = req->size; - - if (kernel_size < COREDUMP_ACK_SIZE_VER0) - return false; - if (kernel_size >= PAGE_SIZE) - return false; - - /* Use the minimum of user and kernel size to read the full request. */ - user_size = sizeof(struct coredump_req); - ack_size = user_size < kernel_size ? user_size : kernel_size; - ret = recv(fd, req, ack_size, MSG_WAITALL); - if (ret != ack_size) - return false; - - fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", - req->size, (unsigned long long)req->mask); - - if (user_size > kernel_size) - remaining_size = user_size - kernel_size; - else - remaining_size = kernel_size - user_size; - - if (PAGE_SIZE <= remaining_size) - return false; - - /* - * Discard any additional data if the kernel's request was larger than - * what we knew about or cared about. - */ - if (remaining_size) { - char buffer[PAGE_SIZE]; - - ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); - if (ret != remaining_size) - return false; - fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); - } - - return true; -} - -static bool send_coredump_ack(int fd, const struct coredump_req *req, - __u64 mask, size_t size_ack) -{ - ssize_t ret; - /* - * Wrap struct coredump_ack in a larger struct so we can - * simulate sending to much data to the kernel. 
- */ - struct large_ack_for_size_testing { - struct coredump_ack ack; - char buffer[PAGE_SIZE]; - } large_ack = {}; - - if (!size_ack) - size_ack = sizeof(struct coredump_ack) < req->size_ack ? - sizeof(struct coredump_ack) : - req->size_ack; - large_ack.ack.mask = mask; - large_ack.ack.size = size_ack; - ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); - if (ret != size_ack) - return false; - - fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", - size_ack, (unsigned long long)mask); - return true; -} - -static bool check_coredump_req(const struct coredump_req *req, size_t min_size, - __u64 required_mask) -{ - if (req->size < min_size) - return false; - if ((req->mask & required_mask) != required_mask) - return false; - if (req->mask & ~required_mask) - return false; - return true; -} - -TEST_F(coredump, socket_request_kernel) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - 
fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_request_userspace) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - 
ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 
0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_reject) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = 
read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_flag_combination) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if 
(!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_unknown_flag) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - 
close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_small) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - 
ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 / 2)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - 
ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_large) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - 
self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static int open_coredump_tmpfile(int fd_tmpfs_detached) -{ - return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); -} - -#define NUM_CRASHING_COREDUMPS 5 - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - struct coredump_req req = {}; - - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) { - fprintf(stderr, "Failed to create and listen on unix socket\n"); - goto out; - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - fprintf(stderr, "Failed to notify parent via ipc socket\n"); - goto out; - } - close(ipc_sockets[1]); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, 
"accept4 failed: %m\n"); - goto out; - } - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) { - fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (!get_pidfd_info(fd_peer_pidfd, &info)) { - fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); - goto out; - } - if (!(info.coredump_mask & PIDFD_COREDUMPED)) { - fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!read_coredump_req(fd_coredump, &req)) { - fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) { - fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { - fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); - goto out; - } - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { - fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); - goto out; - } - - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) { - fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); - goto out; - } - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) { - fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); - goto out; - } - } - - close(fd_core_file); - 
close(fd_peer_pidfd); - close(fd_coredump); - fd_peer_pidfd = -1; - fd_coredump = -1; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - waitpid(pid[i], &status[i], 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -#define MAX_EVENTS 128 - -static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) -{ - int epfd = -1; - int exit_code = EXIT_FAILURE; - - epfd = epoll_create1(0); - if (epfd < 0) - goto out; - - struct epoll_event ev; - ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; - ev.data.fd = fd_coredump; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) - goto out; - - for (;;) { - struct epoll_event events[1]; - int n = epoll_wait(epfd, events, 1, -1); - if (n < 0) - break; - - if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { - for (;;) { - char buffer[4096]; - ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - 
goto out; - } - if (bytes_read == 0) - goto done; - ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_write != bytes_read) - goto out; - } - } - } - -done: - exit_code = EXIT_SUCCESS; -out: - if (epfd >= 0) - close(epfd); - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - _exit(exit_code); -} - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; - fd_server = -1; - exit_code = EXIT_FAILURE; - n_conns = 0; - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - close(ipc_sockets[1]); - - while (n_conns < NUM_CRASHING_COREDUMPS) { - int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - struct coredump_req req = {}; - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - continue; - goto out; - } - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - if (!read_coredump_req(fd_coredump, &req)) - goto out; - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | 
COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) - goto out; - pid_t worker = fork(); - if (worker == 0) { - close(fd_server); - process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); - } - worker_pids[n_conns] = worker; - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_core_file >= 0) - close(fd_core_file); - n_conns++; - } - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - - // Reap all worker processes - for (int i = 0; i < n_conns; i++) { - int wstatus; - if (waitpid(worker_pids[i], &wstatus, 0) < 0) { - fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); - } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { - fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); - exit_code = EXIT_FAILURE; - } - } - - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - 
ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_invalid_paths) -{ - ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@..")); - - ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@@..")); - - ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 6e41635bd55a..71ee69e524d7 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -18,6 +18,7 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 402d4ee84f2e..6c5c60adb5e8 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -14,6 +14,7 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ + netcons_over_bonding.sh \ # end of TEST_PROGS TEST_FILES := \ @@ -24,6 +25,7 @@ TEST_FILES := \ TEST_INCLUDES := \ ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ ../../../net/forwarding/lib.sh \ # end of TEST_INCLUDES diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 
6bb290abd48b..991494376223 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,6 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y +CONFIG_CONFIGFS_FS=y CONFIG_DUMMY=y CONFIG_INET_ESP=y CONFIG_INET_ESP_OFFLOAD=y @@ -9,6 +10,9 @@ CONFIG_MACVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_CLS_MATCHALL=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh new file mode 100755 index 000000000000..477cc9379500 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh @@ -0,0 +1,361 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# This selftest exercises trying to have multiple netpoll users at the same +# time. +# +# This selftest has multiple small tests inside, and the goal is to +# get interfaces with bonding and netconsole in different orders in order +# to catch any possible issue. +# +# The main test consists of four interfaces being created using netdevsim; two +# of them are bonded to serve as the netconsole's transmit interface. The +# remaining two interfaces are similarly bonded and assigned to a separate +# network namespace, which acts as the receive interface, where socat monitors +# for incoming messages. +# +# A netconsole message is then sent to ensure it is properly received across +# this configuration. +# +# Later, run a few other tests, to make sure that bonding and netconsole +# cannot coexist. +# +# The test's objective is to exercise netpoll usage when managed simultaneously +# by multiple subsystems (netconsole and bonding).
+# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true +modprobe bonding 2> /dev/null || true +modprobe veth 2> /dev/null || true + +# The content of kmsg will be saved to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup_bond EXIT + +FORMAT="extended" +IP_VERSION="ipv4" +VETH0="veth"$(( RANDOM % 256)) +VETH1="veth"$((256 + RANDOM % 256)) +TXNS="" +RXNS="" + +# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with +# the bonding interfaces +function setup_bonding_ifaces() { + local RAND=$(( RANDOM % 100 )) + BOND_TX_MAIN_IF="bond_tx_$RAND" + BOND_RX_MAIN_IF="bond_rx_$RAND" + + # Setup TX + if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + then + echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2 + # only clean nsim ifaces and namespace. Nothing else has been + # initialized + cleanup_bond_nsim + trap - EXIT + exit "${ksft_skip}" + fi + + # create_netdevsim() got the interface up, but it needs to be down + # before being enslaved.
+ ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # Setup RX + ip -n "${RXNS}" \ + link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX_MAIN_IF}" up + + export DSTIF="${BOND_RX_MAIN_IF}" + export SRCIF="${BOND_TX_MAIN_IF}" +} + +# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface +# and the other two will be bond to the RX interface (on the other namespace) +function create_ifaces_bond() { + BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}") + BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}") + BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}") + BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}") +} + +# netdevsim link BOND_TX to BOND_RX interfaces +function link_ifaces_bond() { + local BOND_TX1_SLAVE_IFIDX + local BOND_TX2_SLAVE_IFIDX + local BOND_RX1_SLAVE_IFIDX + local BOND_RX2_SLAVE_IFIDX + local TXNS_FD + local RXNS_FD + + BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex) + BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex) + BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex) + BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex) + + exec 
{TXNS_FD}</var/run/netns/"${TXNS}" + exec {RXNS_FD}</var/run/netns/"${RXNS}" + + # Linking TX ifaces to the RX ones (on the other namespace) + echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + + exec {TXNS_FD}<&- + exec {RXNS_FD}<&- +} + +function create_all_ifaces() { + # setup_ns function is coming from lib.sh + setup_ns TXNS RXNS + export NAMESPACE="${RXNS}" + + # Create two interfaces for RX and two for TX + create_ifaces_bond + # Link netlink ifaces + link_ifaces_bond +} + +# configure DSTIF and SRCIF IPs +function configure_ifaces_ips() { + local IP_VERSION=${1:-"ipv4"} + select_ipv4_or_ipv6 "${IP_VERSION}" + + ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}" + ip -n "${RXNS}" link set "${DSTIF}" up + + ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}" + ip -n "${TXNS}" link set "${SRCIF}" up +} + +function test_enable_netpoll_on_enslaved_iface() { + echo 0 > "${NETCONS_PATH}"/enabled + + # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and + # linked to BOND_RX1_SLAVE_IF inside the namespace. + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # This should fail with the following message in dmesg: + # netpoll: netconsole: ethX is a slave device, aborting + set +e + enable_netcons_ns 2> /dev/null + set -e + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Bonding and netpoll cannot co-exists." 
>&2 + exit "${ksft_fail}" + fi +} + +function test_delete_bond_and_reenable_target() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond + + # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore + # netpoll can be plugged in there + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # this should work, since the interface is not enslaved + enable_netcons_ns + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Unable to start netpoll on an unbond iface." >&2 + exit "${ksft_fail}" + fi +} + +# Send a netconsole message to the netconsole target +function test_send_netcons_msg_through_bond_iface() { + # Listen for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat +} + +# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF. +# Given BOND_TX_MAIN_IF was deleted, recreate it first +function test_enslave_netcons_enabled_iface { + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2 + exit "${ksft_fail}" + fi + + # recreate the bonding iface. it got deleted by previous + # test (test_delete_bond_and_reenable_target) + ip -n "${TXNS}" \ + link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + + # sub-interface need to be down before attaching to bonding + # This will also disable netconsole. 
+ ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2 + exit "${ksft_fail}" + fi +} + +# Get netconsole enabled on a bonding interface and attach a second +# sub-interface. +function test_enslave_iface_to_bond { + # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now + echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name + enable_netcons_ns + + # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is + # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF. + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2 + exit "${ksft_fail}" + fi +} + +function test_enslave_iff_disabled_netpoll_iface { + local ret + + # Create two interfaces. veth interfaces are known to have + # IFF_DISABLE_NETPOLL set + if ! ip link add "${VETH0}" type veth peer name "${VETH1}" + then + echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2 + exit "${ksft_skip}" + fi + set +e + # This will print RTNETLINK answers: Device or resource busy + ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null + ret=$?
+ set -e + if [[ $ret -eq 0 ]] + then + echo "test failed: veth interface could not be enslaved" + exit "${ksft_fail}" + fi +} + +# Given that netconsole picks the current net namespace, we need to enable it +# from inside the TXNS namespace +function enable_netcons_ns() { + ip netns exec "${TXNS}" sh -c \ + "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled" +} + +#################### +# Tests start here # +#################### + +# Create regular interfaces using netdevsim and link them +create_all_ifaces + +# Setup the bonding interfaces +# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF +# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF +setup_bonding_ifaces + +# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF +configure_ifaces_ips "${IP_VERSION}" + +_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}" +enable_netcons_ns +set_user_data + +# Test #1 : Create an bonding interface and attach netpoll into +# the bonding interface. Netconsole/netpoll should work on +# the bonding interface. +test_send_netcons_msg_through_bond_iface +echo "test #1: netpoll on bonding interface worked. Test passed" >&2 + +# Test #2: Attach netpoll to an enslaved interface +# Try to attach netpoll to an enslaved sub-interface (while still being part of +# a bonding interface), which shouldn't be allowed +test_enable_netpoll_on_enslaved_iface +echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2 + +# Test #3: Unplug the sub-interface from bond and enable netconsole +# Detach the interface from a bonding interface and attach netpoll again +test_delete_bond_and_reenable_target +echo "test #3: Able to attach to an unbound interface. Test passed." >&2 + +# Test #4: Enslave a sub-interface that had netconsole enabled +# Try to enslave an interface that has netconsole/netpoll enabled. 
+# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it +test_enslave_netcons_enabled_iface +echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2 + +# Test #5: Enslave a sub-interface to a bonding interface +# Enslave an interface to a bond interface that has netpoll attached +# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of +# it. Netconsole is currently disabled +test_enslave_iface_to_bond +echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2 + +# Test #6: Enslave an IFF_DISABLE_NETPOLL sub-interface to a bonding interface +# At this stage, BOND_TX_MAIN_IF has both sub-interfaces and netconsole is +# enabled. This test will try to enslave a veth (IFF_DISABLE_NETPOLL) interface +# and it should fail, with netpoll: veth0 doesn't support polling +test_enslave_iff_disabled_netpoll_iface +echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2 + +cleanup_bond +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 8e1085e89647..87f89fd92f8c 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -11,9 +11,11 @@ set -euo pipefail LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") SRCIF="" # to be populated later +SRCIP="" # to be populated later SRCIP4="192.0.2.1" SRCIP6="fc00::1" DSTIF="" # to be populated later +DSTIP="" # to be populated later DSTIP4="192.0.2.2" DSTIP6="fc00::2" @@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" # NAMESPACE will be populated by setup_ns with a random value NAMESPACE="" -# IDs for netdevsim +# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test +# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the +# same time.
NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_BOND_TX_1=$((768 + RANDOM % 256)) +NSIM_BOND_TX_2=$((1024 + RANDOM % 256)) +NSIM_BOND_RX_1=$((1280 + RANDOM % 256)) +NSIM_BOND_RX_2=$((1536 + RANDOM % 256)) NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" +NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" # Used to create and delete namespaces source "${LIBDIR}"/../../../../net/lib.sh # Create netdevsim interfaces create_ifaces() { - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" udevadm settle 2> /dev/null || true @@ -113,31 +121,38 @@ function set_network() { configure_ip } -function create_dynamic_target() { - local FORMAT=${1:-"extended"} +function _create_dynamic_target() { + local FORMAT="${1:?FORMAT parameter required}" + local NCPATH="${2:?NCPATH parameter required}" DSTMAC=$(ip netns exec "${NAMESPACE}" \ ip link show "${DSTIF}" | awk '/ether/ {print $2}') # Create a dynamic target - mkdir "${NETCONS_PATH}" + mkdir "${NCPATH}" - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + echo "${DSTIP}" > "${NCPATH}"/remote_ip + echo "${SRCIP}" > "${NCPATH}"/local_ip + echo "${DSTMAC}" > "${NCPATH}"/remote_mac + echo "${SRCIF}" > "${NCPATH}"/dev_name if [ "${FORMAT}" == "basic" ] then # Basic target does not support release - echo 0 > "${NETCONS_PATH}"/release - echo 0 > "${NETCONS_PATH}"/extended + echo 0 > "${NCPATH}"/release + echo 0 > "${NCPATH}"/extended elif [ "${FORMAT}" == "extended" ] then - echo 1 > "${NETCONS_PATH}"/extended + echo 1 > "${NCPATH}"/extended fi +} - echo 1 > "${NETCONS_PATH}"/enabled +function create_dynamic_target() { + local FORMAT=${1:-"extended"} + local NCPATH=${2:-"$NETCONS_PATH"} + _create_dynamic_target "${FORMAT}" "${NCPATH}" + + echo 1 > "${NCPATH}"/enabled # This will make sure that the kernel was able to # load the 
netconsole driver configuration. The console message @@ -185,14 +200,26 @@ function do_cleanup() { echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk } -function cleanup() { +function cleanup_netcons() { # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled + # do not fail if the target is already disabled + if [[ ! -d "${NETCONS_PATH}" ]] + then + # in some cases this is called before netcons path is created + return + fi + if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + then + echo 0 > "${NETCONS_PATH}"/enabled || true + fi # Remove all the keys that got created during the selftest find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry rmdir "${NETCONS_PATH}" +} +function cleanup() { + cleanup_netcons do_cleanup } @@ -369,3 +396,24 @@ function wait_for_port() { # more frequently on IPv6 sleep 1 } + +# Clean up netdevsim ifaces created for bonding test +function cleanup_bond_nsim() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond || true + ip -n "${RXNS}" \ + link delete "${BOND_RX_MAIN_IF}" type bond || true + + cleanup_netdevsim "$NSIM_BOND_TX_1" + cleanup_netdevsim "$NSIM_BOND_TX_2" + cleanup_netdevsim "$NSIM_BOND_RX_1" + cleanup_netdevsim "$NSIM_BOND_RX_2" +} + +# cleanup tests that use bonding interfaces +function cleanup_bond() { + cleanup_netcons + cleanup_bond_nsim + cleanup_all_ns + ip link delete "${VETH0}" || true +} diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh new file mode 100755 index 000000000000..2ce9ee3719d1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. 
+# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues are reported. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Number of times the main loop runs +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # create and populate COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ !
-d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c43a69dffd83..a0c64f415a7f 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -487,7 +487,7 @@ int setup_userns(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); if (ret) { ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc 
b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index c62165fabd0c..cfa16aa1f39a 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -20,6 +20,10 @@ sample_events() { echo 0 > tracing_on echo 0 > events/enable +# Clear functions caused by page cache; run sample_events twice +sample_events +sample_events + echo "Get the most frequently calling function" echo > trace sample_events diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index c9b84eeaab6b..0a3a94c4cca1 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -63,11 +63,13 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2), REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP), + REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP), REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP), REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP), + REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), }; bool filter_reg(__u64 reg) @@ -718,6 +720,7 @@ static __u64 el2_regs[] = { SYS_REG(VMPIDR_EL2), SYS_REG(SCTLR_EL2), SYS_REG(ACTLR_EL2), + SYS_REG(SCTLR2_EL2), SYS_REG(HCR_EL2), SYS_REG(MDCR_EL2), SYS_REG(CPTR_EL2), diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 5e24f77868b5..c4815d365816 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -268,7 +268,9 @@ static void guest_code(void) /* Return a safe value to a given ftr_bits an ftr value */ uint64_t get_safe_value(const struct reg_ftr_bits 
*ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) /* Return an invalid value to a given ftr_bits an ftr value */ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -672,7 +676,7 @@ static void test_clidr(struct kvm_vcpu *vcpu) clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); /* find the first empty level in the cache hierarchy */ - for (level = 1; level < 7; level++) { + for (level = 1; level <= 7; level++) { if (!CLIDR_CTYPE(clidr, level)) break; } diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..0e2f8ed90f30 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -15,6 +15,8 @@ #include "gic_v3.h" #include "processor.h" +#define GITS_COLLECTION_TARGET_SHIFT 16 + static u64 its_read_u64(unsigned long offset) { return readq_relaxed(GITS_BASE_GVA + offset); @@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col) its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); } +static u64 procnum_to_rdbase(u32 vcpu_id) +{ + return vcpu_id << GITS_COLLECTION_TARGET_SHIFT; +} + #define GITS_CMDQ_POLL_ITERATIONS 0 static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) @@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 
collection_id, bool val its_encode_cmd(&cmd, GITS_CMD_MAPC); its_encode_collection(&cmd, collection_id); - its_encode_target(&cmd, vcpu_id); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); its_encode_valid(&cmd, valid); its_send_cmd(cmdq_base, &cmd); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 9e3be2ee7f1b..f917b4c4c943 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1758,10 +1758,15 @@ int main(int argc, char *argv[]) uffd_test_ops = mem_type->mem_ops; uffd_test_case_ops = test->test_case_ops; - if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) { gopts.page_size = default_huge_page_size(); - else + if (gopts.page_size == 0) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } + } else { gopts.page_size = psize(); + } /* Ensure we have at least 2 pages */ gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2) @@ -1776,12 +1781,6 @@ int main(int argc, char *argv[]) continue; uffd_test_start("%s on %s", test->name, mem_type->name); - if ((mem_type->mem_flag == MEM_HUGETLB || - mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && - (default_huge_page_size() == 0)) { - uffd_test_skip("huge page size is 0, feature missing?"); - continue; - } if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index ccfb40837a73..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,3 +1,12 @@ nsid_test file_handle_test init_ino_test +ns_active_ref_test +listns_test +listns_permissions_test +listns_efault_test +siocgskns_test +cred_change_test +stress_test +listns_pagination_bug +regression_pidfd_setns_test diff --git 
a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 5fe4b3dc07d3..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap -TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test +TEST_GEN_PROGS := nsid_test \ + file_handle_test \ + init_ino_test \ + ns_active_ref_test \ + listns_test \ + listns_permissions_test \ + listns_efault_test \ + siocgskns_test \ + cred_change_test \ + stress_test \ + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk +$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c +$(OUTPUT)/listns_test: ../filesystems/utils.c +$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c +$(OUTPUT)/siocgskns_test: ../filesystems/utils.c +$(OUTPUT)/cred_change_test: ../filesystems/utils.c +$(OUTPUT)/stress_test: ../filesystems/utils.c +$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c + diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c new file mode 100644 index 000000000000..7b4f5ad3f725 --- /dev/null +++ b/tools/testing/selftests/namespaces/cred_change_test.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test credential changes and 
their impact on namespace active references. + */ + +/* + * Test setuid() in a user namespace properly swaps active references. + * Create a user namespace with multiple UIDs mapped, then setuid() between them. + * Verify that the user namespace remains active throughout. + */ +TEST(setuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setuid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple UIDs mapped (0-9) */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform multiple setuid() calls. + * Each setuid() triggers commit_creds() which should properly + * swap active references via switch_cred_namespaces(). 
+ */ + for (setuid_count = 0; setuid_count < 50; setuid_count++) { + uid_t target_uid = (setuid_count % 10); + if (setuid(target_uid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); + + /* Verify namespace is active while child is running */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + ASSERT_TRUE(found); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive after child exits */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setuid() correctly preserved active references (no leak)"); +} + +/* + * Test setgid() in a user namespace properly handles active references. 
+ */ +TEST(setgid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setgid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple GIDs mapped */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setgid() calls */ + for (setgid_count = 0; setgid_count < 50; setgid_count++) { + gid_t target_gid = (setgid_count % 10); + if (setgid(target_gid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 
0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setgid() correctly preserved active references (no leak)"); +} + +/* + * Test setresuid() which changes real, effective, and saved UIDs. + * This should properly swap active references via commit_creds(). + */ +TEST(setresuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setres_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setresuid() calls */ + for (setres_count = 0; setres_count < 30; setres_count++) { + uid_t uid1 = (setres_count % 5); + uid_t uid2 = ((setres_count + 1) % 5); + uid_t uid3 = ((setres_count + 2) % 5); + + if (setresuid(uid1, uid2, uid3) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, 
SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setresuid() correctly preserved active references (no leak)"); +} + +/* + * Test credential changes across multiple user namespaces. + * Create nested user namespaces and verify active reference tracking. + */ +TEST(cred_change_nested_userns) +{ + pid_t pid; + int status; + __u64 parent_userns_id, child_userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found_parent = false, found_child = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 parent_id, child_id; + uid_t orig_uid = getuid(); + + close(pipefd[0]); + + /* Create first user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get first namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create nested user namespace */ + userns_fd = get_userns_fd(0, 0, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if 
(setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get nested namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + + /* Perform some credential changes in nested namespace */ + setuid(0); + setgid(0); + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read both namespace IDs */ + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get parent namespace ID"); + } + + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get child namespace ID"); + } + close(pipefd[0]); + + TH_LOG("Parent userns: %llu, Child userns: %llu", + (unsigned long long)parent_userns_id, + (unsigned long long)child_userns_id); + + /* Verify both namespaces are active */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_TRUE(found_parent); + ASSERT_TRUE(found_child); + + /* Wait for child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify both namespaces become inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + 
ASSERT_GE(ret, 0); + + found_parent = false; + found_child = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_FALSE(found_parent); + ASSERT_FALSE(found_child); + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); +} + +/* + * Test rapid credential changes don't cause refcount imbalances. + * This stress-tests the switch_cred_namespaces() logic. + */ +TEST(rapid_cred_changes_no_leak) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace with wider range of UIDs/GIDs */ + userns_fd = get_userns_fd(0, orig_uid, 100); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform many rapid credential changes. + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. 
+ */ + for (change_count = 0; change_count < 200; change_count++) { + switch (change_count % 6) { + case 0: + setuid(change_count % 50); + break; + case 1: + setgid(change_count % 50); + break; + case 2: + setreuid(change_count % 50, (change_count + 1) % 50); + break; + case 3: + setregid(change_count % 50, (change_count + 1) % 50); + break; + case 4: + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + case 5: + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive (no leaked active refs) */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("200 rapid credential changes completed with no active ref leak"); +} + +/* + * Test setfsuid/setfsgid which change filesystem UID/GID. + * These also trigger credential changes but may have different code paths. 
+ */
+TEST(setfsuid_preserves_active_refs)
+{
+	pid_t pid;
+	int status;
+	__u64 userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		int fd, userns_fd;
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int change_count;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace */
+		userns_fd = get_userns_fd(0, orig_uid, 10);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Hand the ID to the parent. A short or failed write means the
+		 * parent cannot identify the namespace, so report failure
+		 * instead of carrying on silently.
+		 */
+		if (write(pipefd[1], &child_userns_id, sizeof(child_userns_id)) !=
+		    sizeof(child_userns_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/*
+		 * Perform multiple setfsuid/setfsgid calls. Their return
+		 * values are deliberately ignored: only the credential
+		 * switching itself matters here.
+		 */
+		for (change_count = 0; change_count < 50; change_count++) {
+			setfsuid(change_count % 10);
+			setfsgid(change_count % 10);
+		}
+
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+	close(pipefd[0]);
+
+	/* The child must be gone before the namespace can become inactive */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify namespace becomes inactive */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == userns_id) {
+			found = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found);
+	TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() returns EFAULT.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t iter_pid;
+	pid_t pids[5];
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger EFAULT
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0, /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0, /* Global listing */
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * The kernel should:
+		 * 1. Successfully write the first namespace ID (within valid page)
+		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+		 * 3. Handle concurrent namespace destruction without deadlock
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 * Each PID is remembered so the children can be reaped individually.
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/*
+	 * Wait for all mount namespace children to exit and cleanup.
+	 * Reap by PID: waitpid(-1) could collect the iterator instead if it
+	 * already exited, which would break the waitpid(iter_pid) below.
+	 */
+	for (i = 0; i < 5; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Clean up */
+	munmap(map, page_size);
+
+	/* The iterator exits on its own only when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. 
+ */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) 
{ + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. 
+ * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + 
ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c new file mode 100644 index 000000000000..da7d33f96397 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. + * + * The bug occurs when: + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) + * 2. Using pagination (req.ns_id != 0) + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of + * the filtered type, causing it to search the unified tree and potentially + * return a namespace of the wrong type. 
+ */
+TEST(pagination_with_type_filter)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER, /* Filter by user namespace */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[10];
+	int num_children = 10;
+	int i;
+	int sv[2];
+	__u64 first_batch[3];
+	ssize_t ret;
+
+	/* One socketpair is shared by all children; each side moves single bytes */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			char c = 'R';
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				/*
+				 * Tell the parent about the failure. Exiting
+				 * silently would leave it blocked in read(),
+				 * because the other children keep sv[1] open
+				 * and no EOF is ever delivered.
+				 */
+				c = 'F';
+				if (write(sv[1], &c, 1) != 1)
+					exit(1);
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+
+		if (read(sv[0], &c, 1) != 1 || c != 'R') {
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* First batch - this should work */
+	ret = sys_listns(&req, first_batch, 3, 0);
+	if (ret < 0) {
+		if (errno == ENOSYS) {
+			close(sv[0]);
+			for (i = 0; i < num_children; i++)
+				kill(pids[i], SIGKILL);
+			for (i = 0; i < num_children; i++)
+				waitpid(pids[i], NULL, 0);
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("First batch returned %zd entries", ret);
+
+	/* Only paginate when the first call filled its whole buffer */
+	if (ret == 3) {
+		__u64 second_batch[3];
+
+		/* Second batch - pagination triggers the bug */
+		req.ns_id = first_batch[2]; /* Continue from last ID */
+		ret = sys_listns(&req, second_batch, 3, 0);
+
+		TH_LOG("Second batch returned %zd entries", ret);
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Cleanup */
+	for (i = 0; i < num_children; i++) {
+		int status;
+
+		waitpid(pids[i], &status, 0);
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test that unprivileged users can only see namespaces they're currently in.
+ * Create a namespace, drop privileges, verify we can only see our own namespaces.
+ */
+TEST(listns_unprivileged_current_only)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool found_ours;
+	int unexpected_count;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 our_netns_id;
+
+		close(pipefd[0]);
+
+		/* Create user namespace to be unprivileged */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create a network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get our network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Now we're unprivileged - list all network namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			/* Let the parent tell "unsupported" apart from failure */
+			int code = (errno == ENOSYS) ? KSFT_SKIP : 1;
+
+			close(pipefd[1]);
+			exit(code);
+		}
+
+		/* We should only see our own network namespace */
+		found_ours = false;
+		unexpected_count = 0;
+
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == our_netns_id) {
+				found_ours = true;
+			} else {
+				/* This is either init_net (which we can see) or unexpected */
+				unexpected_count++;
+			}
+		}
+
+		/* Send results to parent */
+		write(pipefd[1], &found_ours, sizeof(found_ours));
+		write(pipefd[1], &unexpected_count, sizeof(unexpected_count));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	/* Defaults make a missing report count as "not found" */
+	found_ours = false;
+	unexpected_count = 0;
+	read(pipefd[0], &found_ours, sizeof(found_ours));
+	read(pipefd[0], &unexpected_count, sizeof(unexpected_count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	if (WEXITSTATUS(status) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Child should have seen its own namespace */
+	ASSERT_TRUE(found_ours);
+
+	TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)",
+	       unexpected_count);
+}
+
+/*
+ * Test that users with CAP_SYS_ADMIN in a user namespace can see
+ * all namespaces owned by that user namespace.
+ */
+TEST(listns_cap_sys_admin_in_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be set to our created user namespace */
+	};
+	__u64 ns_ids[100];
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success;
+	ssize_t count;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 userns_id;
+		ssize_t ret;
+		int min_expected;
+
+		close(pipefd[0]);
+
+		/* Create user namespace - we'll have CAP_SYS_ADMIN in it */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get the user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create several namespaces owned by this user namespace.
+		 * The count check below relies on all three existing, so a
+		 * failed unshare() is reported directly.
+		 */
+		if (unshare(CLONE_NEWNET) < 0 ||
+		    unshare(CLONE_NEWUTS) < 0 ||
+		    unshare(CLONE_NEWIPC) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List namespaces owned by our user namespace */
+		req.user_ns_id = userns_id;
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			/* Let the parent tell "unsupported" apart from failure */
+			int code = (errno == ENOSYS) ? KSFT_SKIP : 1;
+
+			close(pipefd[1]);
+			exit(code);
+		}
+
+		/*
+		 * We have CAP_SYS_ADMIN in this user namespace,
+		 * so we should see all namespaces owned by it.
+		 * That includes: net, uts, ipc, and the user namespace itself.
+		 */
+		min_expected = 4;
+		success = (ret >= min_expected);
+
+		write(pipefd[1], &success, sizeof(success));
+		write(pipefd[1], &ret, sizeof(ret));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	/* Defaults make a missing report count as failure */
+	success = false;
+	count = 0;
+	read(pipefd[0], &success, sizeof(success));
+	read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	if (WEXITSTATUS(status) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(success);
+	TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+	       count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ * Create two sibling user namespaces, verify they can't see each other's
+ * owned namespaces.
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+	int pipefd[2];
+	int ctlfd[2];
+	pid_t pid1, pid2;
+	int status_a, status_b;
+	__u64 netns_a_id;
+	int pipefd2[2];
+	bool found_sibling_netns;
+	bool got_id, got_result;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	/*
+	 * Control pipe used to keep the first child alive: it blocks reading
+	 * ctlfd[0] and only sees EOF once every copy of ctlfd[1] is closed.
+	 * Reading from the write end of the data pipe fails immediately, so
+	 * it cannot be used for this.
+	 */
+	ASSERT_EQ(pipe(ctlfd), 0);
+
+	/* Fork first child - creates user namespace A */
+	pid1 = fork();
+	ASSERT_GE(pid1, 0);
+
+	if (pid1 == 0) {
+		int fd;
+		char buf;
+
+		close(pipefd[0]);
+		close(ctlfd[1]);
+
+		/* Create user namespace A */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create network namespace owned by user namespace A */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send namespace ID to parent */
+		if (write(pipefd[1], &netns_a_id, sizeof(netns_a_id)) !=
+		    sizeof(netns_a_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Keep alive for sibling to check: blocks until EOF */
+		if (read(ctlfd[0], &buf, 1) < 0)
+			exit(1);
+		close(ctlfd[0]);
+		exit(0);
+	}
+
+	/* Parent reads namespace A ID */
+	close(pipefd[1]);
+	close(ctlfd[0]);
+	netns_a_id = 0;
+	got_id = read(pipefd[0], &netns_a_id, sizeof(netns_a_id)) ==
+		 sizeof(netns_a_id);
+	close(pipefd[0]);
+	if (!got_id) {
+		/* Release and reap the first child before failing */
+		close(ctlfd[1]);
+		waitpid(pid1, NULL, 0);
+	}
+	ASSERT_TRUE(got_id);
+
+	TH_LOG("User namespace A created network namespace with ID %llu",
+	       (unsigned long long)netns_a_id);
+
+	/* Fork second child - creates user namespace B */
+	ASSERT_EQ(pipe(pipefd2), 0);
+
+	pid2 = fork();
+	ASSERT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNET,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		/* Drop the inherited write end so only the parent controls child A */
+		close(ctlfd[1]);
+		close(pipefd2[0]);
+
+		/* Create user namespace B (sibling to A) */
+		if (setup_userns() < 0) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+
+		/* Try to list all network namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0 && errno == ENOSYS) {
+			close(pipefd2[1]);
+			exit(KSFT_SKIP);
+		}
+
+		found_sibling_netns = false;
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == netns_a_id) {
+				found_sibling_netns = true;
+				break;
+			}
+		}
+
+		/* We should NOT see the sibling's network namespace */
+		if (write(pipefd2[1], &found_sibling_netns,
+			  sizeof(found_sibling_netns)) !=
+		    sizeof(found_sibling_netns)) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+		close(pipefd2[1]);
+		exit(0);
+	}
+
+	/* Parent reads result from second child */
+	close(pipefd2[1]);
+	found_sibling_netns = false;
+	got_result = read(pipefd2[0], &found_sibling_netns,
+			  sizeof(found_sibling_netns)) ==
+		     sizeof(found_sibling_netns);
+	close(pipefd2[0]);
+
+	/* Signal first child to exit: EOF on the control pipe */
+	close(ctlfd[1]);
+
+	/* Wait for both children before asserting so neither is left behind */
+	waitpid(pid2, &status_b, 0);
+	waitpid(pid1, &status_a, 0);
+
+	ASSERT_TRUE(WIFEXITED(status_b));
+	if (WEXITSTATUS(status_b) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status_b), 0);
+
+	ASSERT_TRUE(WIFEXITED(status_a));
+	ASSERT_EQ(WEXITSTATUS(status_a), 0);
+
+	/* A missing report must not be mistaken for "not found" */
+	ASSERT_TRUE(got_result);
+
+	/* Second child should NOT have seen first child's namespace */
+	ASSERT_FALSE(found_sibling_netns);
+	TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test permission checking with LISTNS_CURRENT_USER.
+ * Verify that listing with LISTNS_CURRENT_USER respects permissions.
+ */
+TEST(listns_current_user_permissions)
+{
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success;
+	ssize_t count;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = LISTNS_CURRENT_USER,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		close(pipefd[0]);
+
+		/* Create user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create some namespaces owned by this user namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (unshare(CLONE_NEWUTS) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0 && errno == ENOSYS) {
+			/* Let the parent skip instead of reporting a failure */
+			close(pipefd[1]);
+			exit(KSFT_SKIP);
+		}
+
+		success = (ret >= 3); /* At least user, net, uts */
+		write(pipefd[1], &success, sizeof(success));
+		write(pipefd[1], &ret, sizeof(ret));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	/* Defaults make a missing report count as failure */
+	success = false;
+	count = 0;
+	read(pipefd[0], &success, sizeof(success));
+	read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	if (WEXITSTATUS(status) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(success);
+	TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that CAP_SYS_ADMIN in parent user namespace allows seeing
+ * child user namespace's owned namespaces.
+ */ +TEST(listns_parent_userns_cap_sys_admin) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_child_userns; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 parent_userns_id; + __u64 child_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_child_userns; + + close(pipefd[0]); + + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get parent user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get child user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create namespaces owned by child user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List namespaces owned by parent user namespace */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = 0; + req.spare2 = 0; + req.user_ns_id = parent_userns_id; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* Should see child user namespace in the list */ + found_child_userns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == child_userns_id) { + found_child_userns = true; + break; + } + } + } + + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_child_userns = false; + count = 0; + read(pipefd[0], 
&found_child_userns, sizeof(found_child_userns)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_child_userns); + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", + count); +} + +/* + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. + * This is different from seeing namespaces owned by a user namespace. + */ +TEST(listns_cap_sys_admin_inside_userns) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_ours; + + close(pipefd[0]); + + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* List all user namespaces globally */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = CLONE_NEWUSER; + req.spare2 = 0; + req.user_ns_id = 0; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* We should be able to see our own user namespace */ + found_ours = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_userns_id) { + found_ours = true; + break; + } + } + } + + write(pipefd[1], &found_ours, sizeof(found_ours)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + read(pipefd[0], &found_ours, sizeof(found_ours)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); 
+ + ASSERT_TRUE(found_ours); + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); +} + +/* + * Test that dropping CAP_SYS_ADMIN restricts what we can see. + */ +TEST(listns_drop_cap_sys_admin) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + + /* This test needs to start with CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (!caps) { + SKIP(return, "Cannot get capabilities"); + } + + cap_flag_value_t cap_val; + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { + cap_free(caps); + SKIP(return, "Cannot check CAP_SYS_ADMIN"); + } + + if (cap_val != CAP_SET) { + cap_free(caps); + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); + } + cap_free(caps); + + int pipefd[2]; + pid_t pid; + int status; + bool correct; + ssize_t count_before, count_after; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids_before[100]; + ssize_t count_before; + __u64 ns_ids_after[100]; + ssize_t count_after; + bool correct; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Count namespaces with CAP_SYS_ADMIN */ + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + + /* Drop CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (caps) { + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); + cap_set_proc(caps); + cap_free(caps); + } + + /* Ensure we can't regain the capability */ + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* Count namespaces without CAP_SYS_ADMIN */ + count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ + correct = (count_after <= count_before); + + 
write(pipefd[1], &correct, sizeof(correct)); + write(pipefd[1], &count_before, sizeof(count_before)); + write(pipefd[1], &count_after, sizeof(count_after)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + correct = false; + count_before = 0; + count_after = 0; + read(pipefd[0], &correct, sizeof(correct)); + read(pipefd[0], &count_before, sizeof(count_before)); + read(pipefd[0], &count_after, sizeof(count_after)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(correct); + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", + count_before, count_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c new file mode 100644 index 000000000000..8a95789d6a87 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_test.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test basic listns() functionality with the unified namespace tree. + * List all active namespaces globally. 
+ */ +TEST(listns_basic_unified) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + + /* Should find at least the initial namespaces */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active namespaces", ret); + + /* Verify all returned IDs are non-zero */ + for (ssize_t i = 0; i < ret; i++) { + ASSERT_NE(ns_ids[i], 0); + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } +} + +/* + * Test listns() with type filtering. + * List only network namespaces. + */ +TEST(listns_filter_by_type) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, /* Only network namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least init_net */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active network namespaces", ret); + + /* Verify we can open each namespace and it's actually a network namespace */ + for (ssize_t i = 0; i < ret && i < 5; i++) { + struct nsfs_file_handle nsfh = { + .ns_id = ns_ids[i], + .ns_type = CLONE_NEWNET, + .ns_inum = 0, + }; + struct file_handle *fh; + int fd; + + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); + ASSERT_NE(fh, NULL); + fh->handle_bytes = sizeof(nsfh); + fh->handle_type = 0; + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); + + fd = open_by_handle_at(-10003, fh, O_RDONLY); + 
free(fh); + + if (fd >= 0) { + int ns_type; + /* Verify it's a network namespace via ioctl */ + ns_type = ioctl(fd, NS_GET_NSTYPE); + if (ns_type >= 0) { + ASSERT_EQ(ns_type, CLONE_NEWNET); + } + close(fd); + } + } +} + +/* + * Test listns() pagination. + * List namespaces in batches. + */ +TEST(listns_pagination) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 batch1[2], batch2[2]; + ssize_t ret1, ret2; + + /* Get first batch */ + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); + if (ret1 < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret1, 0); + + if (ret1 == 0) + SKIP(return, "No namespaces found"); + + TH_LOG("First batch: %zd namespaces", ret1); + + /* Get second batch using last ID from first batch */ + if (ret1 == ARRAY_SIZE(batch1)) { + req.ns_id = batch1[ret1 - 1]; + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); + ASSERT_GE(ret2, 0); + + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", + ret2, (unsigned long long)req.ns_id); + + /* If we got more results, verify IDs are monotonically increasing */ + if (ret2 > 0) { + ASSERT_GT(batch2[0], batch1[ret1 - 1]); + TH_LOG("Pagination working: %llu > %llu", + (unsigned long long)batch2[0], + (unsigned long long)batch1[ret1 - 1]); + } + } else { + TH_LOG("All namespaces fit in first batch"); + } +} + +/* + * Test listns() with LISTNS_CURRENT_USER. + * List namespaces owned by current user namespace. 
+ */ +TEST(listns_current_user) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least the initial namespaces if we're in init_user_ns */ + TH_LOG("Found %zd namespaces owned by current user namespace", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that listns() only returns active namespaces. + * Create a namespace, let it become inactive, verify it's not listed. + */ +TEST(listns_only_active) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[100], ns_ids_after[100]; + ssize_t ret_before, ret_after; + int pipefd[2]; + pid_t pid; + __u64 new_ns_id = 0; + int status; + + /* Get initial list */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret_before, 0); + + TH_LOG("Before: %zd active network namespaces", ret_before); + + /* Create a new namespace in a child process and get its ID */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + + close(pipefd[0]); + + /* Create new network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get its ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + 
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + write(pipefd[1], &ns_id, sizeof(ns_id)); + close(pipefd[1]); + + /* Keep namespace active briefly */ + usleep(100000); + exit(0); + } + + /* Parent reads the new namespace ID */ + { + int bytes; + + close(pipefd[1]); + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); + close(pipefd[0]); + + if (bytes == sizeof(new_ns_id)) { + __u64 ns_ids_during[100]; + int ret_during; + + TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); + + /* List namespaces while child is still alive - should see new one */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + TH_LOG("During: %d active network namespaces", ret_during); + + /* Should have more namespaces than before */ + ASSERT_GE(ret_during, ret_before); + } + } + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + + /* Give time for namespace to become inactive */ + usleep(100000); + + /* List namespaces after child exits - should not see new one */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + TH_LOG("After: %zd active network namespaces", ret_after); + + /* Verify the new namespace ID is not in the after list */ + if (new_ns_id != 0) { + bool found = false; + + for (ssize_t i = 0; i < ret_after; i++) { + if (ns_ids_after[i] == new_ns_id) { + found = true; + break; + } + } + ASSERT_FALSE(found); + } +} + +/* + * Test listns() with specific user namespace ID. + * Create a user namespace and list namespaces it owns. 
+ */ +TEST(listns_specific_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, /* Will be filled with created userns ID */ + }; + __u64 ns_ids[100]; + int sv[2]; + pid_t pid; + int status; + __u64 user_ns_id = 0; + int bytes; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + char buf; + + close(sv[0]); + + /* Create new user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) { + close(sv[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id)); + + if (bytes != sizeof(user_ns_id)) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get user namespace ID from child"); + } + + TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id); + + /* List namespaces owned by this user namespace */ + req.user_ns_id = user_ns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0) { + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) { + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("Found %zd namespaces owned by user namespace %llu", ret, + (unsigned 
long long)user_ns_id); + + /* Should find at least the network and UTS namespaces we created */ + if (ret > 0) { + for (ssize_t i = 0; i < ret && i < 10; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test listns() with multiple namespace types filter. + */ +TEST(listns_multiple_types) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + TH_LOG("Found %zd active network/UTS namespaces", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that hierarchical active reference propagation keeps parent + * user namespaces visible in listns(). 
+ */ +TEST(listns_hierarchical_visibility) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 parent_ns_id = 0, child_ns_id = 0; + int sv[2]; + pid_t pid; + int status; + int bytes; + __u64 ns_ids[100]; + ssize_t ret; + bool found_parent, found_child; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + char buf; + + close(sv[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { + close(sv[1]); + exit(1); + } + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + + /* Read both namespace IDs */ + bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id)); + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); + + if (bytes != (int)(2 * sizeof(__u64))) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace IDs from child"); + } + + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); + TH_LOG("Child user namespace ID: %llu", (unsigned long 
long)child_ns_id); + + /* List all user namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0 && errno == ENOSYS) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "listns() not supported"); + } + + ASSERT_GE(ret, 0); + TH_LOG("Found %zd active user namespaces", ret); + + /* Both parent and child should be visible (active due to child process) */ + found_parent = false; + found_child = false; + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == parent_ns_id) + found_parent = true; + if (ns_ids[i] == child_ns_id) + found_child = true; + } + + TH_LOG("Parent namespace %s, child namespace %s", + found_parent ? "found" : "NOT FOUND", + found_child ? "found" : "NOT FOUND"); + + ASSERT_TRUE(found_child); + /* With hierarchical propagation, parent should also be active */ + ASSERT_TRUE(found_parent); + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test error cases for listns(). 
+ */ +TEST(listns_error_cases) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[10]; + int ret; + + /* Test with invalid flags */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + + /* Test with NULL ns_ids array */ + ret = sys_listns(&req, NULL, 10, 0); + ASSERT_LT(ret, 0); + + /* Test with invalid spare field */ + req.spare = 1; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + req.spare = 0; + + /* Test with huge nr_ns_ids */ + ret = sys_listns(&req, ns_ids, 2000000, 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c new file mode 100644 index 000000000000..093268f0efaa --- /dev/null +++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c @@ -0,0 +1,2672 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <pthread.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test that initial namespaces can be 
reopened via file handle.
 * Initial namespaces should have active ref count of 1 from boot.
 */
TEST(init_ns_always_active)
{
	static const char init_net_path[] = "/proc/1/ns/net";
	struct file_handle *handle;
	struct stat st1, st2;
	int fd1, fd2;
	int mount_id;
	int ret;

	handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
	ASSERT_NE(handle, NULL);

	/* Open initial network namespace */
	fd1 = open(init_net_path, O_RDONLY);
	ASSERT_GE(fd1, 0);

	/* Get file handle for initial namespace */
	handle->handle_bytes = MAX_HANDLE_SZ;
	ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH);
	if (ret < 0 && errno == EOPNOTSUPP) {
		/* Release everything acquired so far before skipping. */
		SKIP(free(handle); close(fd1);
		     return, "nsfs doesn't support file handles");
	}
	ASSERT_EQ(ret, 0);

	/* From here on only the handle refers to the namespace. */
	close(fd1);

	/* Try to reopen via file handle - should succeed since init ns is always active */
	fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
	if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
		SKIP(free(handle);
		     return, "open_by_handle_at with FD_NSFS_ROOT not supported");
	}
	ASSERT_GE(fd2, 0);

	/* Verify we opened the same namespace: both fds must share an inode */
	fd1 = open(init_net_path, O_RDONLY);
	ASSERT_GE(fd1, 0);
	ASSERT_EQ(fstat(fd1, &st1), 0);
	ASSERT_EQ(fstat(fd2, &st2), 0);
	ASSERT_EQ(st1.st_ino, st2.st_ino);

	close(fd1);
	close(fd2);
	free(handle);
}

/*
 * Test namespace lifecycle: create a namespace in a child process,
 * get a file handle while it's active, then try to reopen after
 * the process exits (namespace becomes inactive).
+ */ +TEST(ns_inactive_after_exit) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + /* Create pipe for passing file handle from child */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Open our new namespace */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get file handle for the namespace */ + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Exit - namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read file handle from child */ + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should fail with ENOENT (namespace inactive) or ESTALE */ + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a process is using it, + * even after the creating process exits. 
+ */ +TEST(ns_active_with_multiple_processes) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + int syncpipe[2]; + pid_t pid1, pid2; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + + /* Create pipes for communication */ + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + /* First child - creates namespace */ + close(pipefd[0]); + close(syncpipe[1]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Open and get handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Wait for signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent reads handle */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + ASSERT_GT(ret, 0); + + handle = (struct file_handle *)buf; + + /* Create second child that will keep namespace active */ + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + /* Second child - reopens the namespace */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Open the namespace via handle */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0) { + exit(1); + } + + /* Join the namespace */ + ret = setns(fd, CLONE_NEWNET); + close(fd); + if (ret < 0) { + exit(1); + } + + /* Sleep to keep namespace active */ + sleep(1); + exit(0); + } + + /* Let second child enter the namespace */ + 
usleep(100000); /* 100ms */ + + /* Signal first child to exit */ + close(syncpipe[0]); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for first child */ + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Namespace should still be active because second child is using it */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd, 0); + close(fd); + + /* Wait for second child */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); +} + +/* + * Test user namespace active ref tracking via credential lifecycle + */ +TEST(userns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Set up uid/gid mappings */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + } + + /* Get file handle */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + 
} + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all tasks exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test PID namespace active ref tracking + */ +TEST(pidns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Fork to actually enter the PID namespace */ + pid_t child = fork(); + if (child < 0) { + close(pipefd[1]); + exit(1); + } + + if (child == 0) { + /* Grandchild - in new PID namespace */ + fd = open("/proc/self/ns/pid", O_RDONLY); + if (fd < 0) { + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + exit(1); + } + + /* Send handle to grandparent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child, NULL, 0); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = 
(struct file_handle *)buf; + + /* Namespace should be inactive after all processes exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that an open file descriptor keeps a namespace active. + * Even after the creating process exits, the namespace should remain + * active as long as an fd is held open. + */ +TEST(ns_fd_keeps_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int nsfd; + int pipe_child_ready[2]; + int pipe_parent_ready[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + char proc_path[64]; + + ASSERT_EQ(pipe(pipe_child_ready), 0); + ASSERT_EQ(pipe(pipe_parent_ready), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipe_child_ready[0]); + close(pipe_parent_ready[1]); + + TH_LOG("Child: creating new network namespace"); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: network namespace created successfully"); + + /* Get file handle for the namespace */ + nsfd = open("/proc/self/ns/net", O_RDONLY); + if (nsfd < 0) { + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: opened namespace fd %d", nsfd); + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); + close(nsfd); + + if (ret < 0) { + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); + + /* Send file handle to parent */ + ret = 
write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); + TH_LOG("Child: sent %d bytes of file handle to parent", ret); + close(pipe_child_ready[1]); + + /* Wait for parent to open the fd */ + TH_LOG("Child: waiting for parent to open fd"); + ret = read(pipe_parent_ready[0], &sync_byte, 1); + close(pipe_parent_ready[0]); + + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); + /* Exit - namespace should stay active because parent holds fd */ + exit(0); + } + + /* Parent process */ + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + + TH_LOG("Parent: reading file handle from child"); + + /* Read file handle from child */ + ret = read(pipe_child_ready[0], buf, sizeof(buf)); + close(pipe_child_ready[0]); + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); + + /* Open the child's namespace while it's still alive */ + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); + TH_LOG("Parent: opening child's namespace at %s", proc_path); + nsfd = open(proc_path, O_RDONLY); + if (nsfd < 0) { + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); + close(pipe_parent_ready[1]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child's namespace"); + } + + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); + + /* Signal child that we have the fd */ + sync_byte = 'G'; + write(pipe_parent_ready[1], &sync_byte, 1); + close(pipe_parent_ready[1]); + TH_LOG("Parent: signaled child that we have the fd"); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); + + /* + * Namespace should still be ACTIVE because we hold an fd. + * We should be able to reopen it via file handle. 
+ */ + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd2, 0); + + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(fd2); + + /* Now close the fd - namespace should become inactive */ + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); + close(nsfd); + + /* Now reopening should fail - namespace is inactive */ + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd2, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test hierarchical active reference propagation. + * When a child namespace is active, its owning user namespace should also + * be active automatically due to hierarchical active reference propagation. + * This ensures parents are always reachable when children are active. 
+ */
+TEST(ns_parent_always_reachable)
+{
+	struct file_handle *parent_handle, *child_handle;
+	int ret;
+	int saved_errno;
+	int child_nsfd;
+	int pipefd[2];
+	int syncpipe[2];
+	pid_t pid;
+	int status;
+	__u64 parent_id, child_id;
+	char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+
+	/*
+	 * pipefd carries the namespace IDs from child to parent. syncpipe
+	 * keeps the child alive until the parent has opened the nested
+	 * namespace; without it the child could exit first and the open
+	 * below would race with the namespace becoming inactive.
+	 */
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		TH_LOG("Child: creating parent user namespace and setting up mappings");
+
+		/* Create parent user namespace with mappings */
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for parent user namespace */
+		int parent_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (parent_fd < 0) {
+			TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened parent userns fd %d", parent_fd);
+
+		if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno));
+			close(parent_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(parent_fd);
+
+		TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id);
+
+		/* Create child user namespace within parent */
+		TH_LOG("Child: creating nested child user namespace");
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for child user namespace */
+		int child_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (child_fd < 0) {
+			TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened child userns fd %d", child_fd);
+
+		if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno));
+			close(child_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(child_fd);
+
+		TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id);
+
+		/* Send both namespace IDs to parent */
+		TH_LOG("Child: sending both namespace IDs to parent");
+		if (write(pipefd[1], &parent_id, sizeof(parent_id)) != (ssize_t)sizeof(parent_id) ||
+		    write(pipefd[1], &child_id, sizeof(child_id)) != (ssize_t)sizeof(child_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/*
+		 * Stay alive until the parent has opened the child namespace.
+		 * read() returns on EOF once the parent closes its write end.
+		 */
+		ret = read(syncpipe[0], &sync_byte, 1);
+		close(syncpipe[0]);
+
+		TH_LOG("Child: exiting - parent userns should become inactive");
+		/* Exit - parent user namespace should become inactive */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	TH_LOG("Parent: reading both namespace IDs from child");
+
+	/* Read both namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &parent_id, sizeof(parent_id));
+	if (ret != sizeof(parent_id)) {
+		close(pipefd[0]);
+		/* Release the child before reaping it so waitpid() cannot hang */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &child_id, sizeof(child_id));
+	close(pipefd[0]);
+	if (ret != sizeof(child_id)) {
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read child namespace ID from child");
+	}
+
+	TH_LOG("Parent: received parent_id=%llu, child_id=%llu",
+	       (unsigned long long)parent_id, (unsigned long long)child_id);
+
+	/* Construct file handles from namespace IDs */
+	parent_handle = (struct file_handle *)parent_buf;
+	parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	parent_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+	parent_fh->ns_id = parent_id;
+	parent_fh->ns_type = 0;
+	parent_fh->ns_inum = 0;
+
+	child_handle = (struct file_handle *)child_buf;
+	child_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	child_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle;
+	child_fh->ns_id = child_id;
+	child_fh->ns_type = 0;
+	child_fh->ns_inum = 0;
+
+	TH_LOG("Parent: opening child namespace BEFORE child exits");
+
+	/* Open child namespace while child is still alive to keep it active */
+	child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY);
+	saved_errno = errno;
+
+	/* Closing the write end delivers EOF to the child's blocking read */
+	close(syncpipe[1]);
+
+	if (child_nsfd < 0) {
+		TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(saved_errno), saved_errno);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child namespace");
+	}
+
+	TH_LOG("Opened child namespace fd %d", child_nsfd);
+
+	/* Now wait for child to exit */
+	TH_LOG("Parent: waiting for child to exit");
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	TH_LOG("Child process exited, parent holds fd to child namespace");
+
+	/*
+	 * With hierarchical active reference propagation:
+	 * Since the child namespace is active (parent process holds fd),
+	 * the parent user namespace should ALSO be active automatically.
+	 * This is because when we took an active reference on the child,
+	 * it propagated up to the owning user namespace.
+	 */
+	TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)");
+	int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd, 0);
+
+	TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd);
+
+	/* Verify we can also get parent via NS_GET_USERNS */
+	TH_LOG("Verifying NS_GET_USERNS also works");
+	int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS);
+	if (parent_fd2 < 0) {
+		/* Log first: the close() calls below may overwrite errno */
+		TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno);
+		close(parent_fd);
+		close(child_nsfd);
+		SKIP(return, "NS_GET_USERNS not supported or failed");
+	}
+
+	TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2);
+
+	/* Verify both methods give us the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(parent_fd, &st1), 0);
+	ASSERT_EQ(fstat(parent_fd2, &st2), 0);
+	/* ino_t is not unsigned long on every ABI; cast to match %lu */
+	TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu",
+	       (unsigned long)st1.st_ino, (unsigned long)st2.st_ino);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	/*
+	 * Close child fd - parent should remain active because we still
+	 * hold direct references to it (parent_fd and parent_fd2).
+	 */
+	TH_LOG("Closing child fd - parent should remain active (direct refs held)");
+	close(child_nsfd);
+
+	/* Parent should still be openable */
+	TH_LOG("Verifying parent still active via file handle");
+	int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd3, 0);
+	close(parent_fd3);
+
+	TH_LOG("Closing all fds to parent namespace");
+	close(parent_fd);
+	close(parent_fd2);
+
+	/* Both should now be inactive */
+	TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)");
+	parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	/* Logging may modify errno, so keep the value the syscall produced */
+	saved_errno = errno;
+	ASSERT_LT(parent_fd, 0);
+	TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(saved_errno), saved_errno);
+	ASSERT_TRUE(saved_errno == ENOENT || saved_errno == ESTALE);
+}
+
+/*
+ * Test that bind mounts keep namespaces in the tree even when inactive
+ *
+ * The child bind-mounts its new network namespace onto a temporary file
+ * inside a private mount namespace and sends the namespace's file handle
+ * to the parent. After the child exited the parent expects reopening the
+ * handle to fail. The temporary file is removed on every exit path.
+ */
+TEST(ns_bind_mount_keeps_in_tree)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int saved_errno;
+	int fd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	char tmpfile[] = "/tmp/ns-test-XXXXXX";
+	int tmpfd;
+
+	/* Create temporary file for bind mount */
+	tmpfd = mkstemp(tmpfile);
+	if (tmpfd < 0) {
+		SKIP(return, "Cannot create temporary file");
+	}
+	close(tmpfd);
+
+	/* From here on, a failing assertion must not leave tmpfile behind */
+	ret = pipe(pipefd);
+	if (ret < 0)
+		unlink(tmpfile);
+	ASSERT_EQ(ret, 0);
+
+	pid = fork();
+	if (pid < 0)
+		unlink(tmpfile);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		/* Unshare mount namespace and make mounts private to avoid propagation */
+		ret = unshare(CLONE_NEWNS);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+		ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Bind mount the namespace */
+		ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Get file handle */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			umount(tmpfile);
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(fd);
+
+		if (ret < 0) {
+			umount(tmpfile);
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Send handle to parent */
+		ret = write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+		close(pipefd[1]);
+		/* A failed or short write would hand the parent a truncated handle */
+		if (ret != (int)(sizeof(*handle) + handle->handle_bytes)) {
+			umount(tmpfile);
+			unlink(tmpfile);
+			exit(1);
+		}
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	/*
+	 * The assertions below abort the test on failure; remove the
+	 * temporary file first in that case so it is not leaked.
+	 */
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 || ret <= 0)
+		unlink(tmpfile);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	/*
+	 * Namespace should be inactive but still in tree due to bind mount.
+	 * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree).
+	 */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	saved_errno = errno;
+
+	/* Cleanup before asserting so a failed assertion cannot leak tmpfile */
+	umount(tmpfile);
+	unlink(tmpfile);
+
+	ASSERT_LT(fd, 0);
+	/* Should be ENOENT (inactive) since bind mount keeps it in tree */
+	if (saved_errno != ENOENT && saved_errno != ESTALE) {
+		TH_LOG("Unexpected error: %d", saved_errno);
+	}
+}
+
+/*
+ * Test multi-level hierarchy (3+ levels deep).
+ * Grandparent → Parent → Child
+ * When child is active, both parent AND grandparent should be active.
+ */ +TEST(ns_multilevel_hierarchy) +{ + struct file_handle *gp_handle, *p_handle, *c_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 gp_id, p_id, c_id; + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create grandparent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int gp_fd = open("/proc/self/ns/user", O_RDONLY); + if (gp_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { + close(gp_fd); + close(pipefd[1]); + exit(1); + } + close(gp_fd); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c_fd = open("/proc/self/ns/user", O_RDONLY); + if (c_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { + close(c_fd); + close(pipefd[1]); + exit(1); + } + close(c_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &gp_id, sizeof(gp_id)); + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c_id, sizeof(c_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); + if (ret != sizeof(gp_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read grandparent namespace ID from child"); + } + + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + 
SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &c_id, sizeof(c_id)); + close(pipefd[0]); + if (ret != sizeof(c_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + /* Construct file handles from namespace IDs */ + gp_handle = (struct file_handle *)gp_buf; + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); + gp_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; + gp_fh->ns_id = gp_id; + gp_fh->ns_type = 0; + gp_fh->ns_inum = 0; + + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c_handle = (struct file_handle *)c_buf; + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; + c_fh->ns_id = c_id; + c_fh->ns_type = 0; + c_fh->ns_inum = 0; + + /* Open child before process exits */ + int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); + if (c_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * With 3-level hierarchy and child active: + * - Child is active (we hold fd) + * - Parent should be active (propagated from child) + * - Grandparent should be active (propagated from parent) + */ + TH_LOG("Testing parent active when child is active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + + TH_LOG("Testing grandparent active when child is active"); + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); + ASSERT_GE(gp_fd, 0); + + 
close(c_fd); + close(p_fd); + close(gp_fd); +} + +/* + * Test multiple children sharing same parent. + * Parent should stay active as long as ANY child is active. + */ +TEST(ns_multiple_children_same_parent) +{ + struct file_handle *p_handle, *c1_handle, *c2_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 p_id, c1_id, c2_id; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c1_fd = open("/proc/self/ns/user", O_RDONLY); + if (c1_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { + close(c1_fd); + close(pipefd[1]); + exit(1); + } + close(c1_fd); + + /* Return to parent user namespace and create second child */ + /* We can't actually do this easily, so let's create a sibling namespace + * by creating a network namespace instead */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int c2_fd = open("/proc/self/ns/net", O_RDONLY); + if (c2_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { + close(c2_fd); + close(pipefd[1]); + exit(1); + } + close(c2_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c1_id, sizeof(c1_id)); + write(pipefd[1], &c2_id, sizeof(c2_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no 
parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); + if (ret != sizeof(c1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first child namespace ID"); + } + + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); + close(pipefd[0]); + if (ret != sizeof(c2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second child namespace ID"); + } + + /* Construct file handles from namespace IDs */ + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c1_handle = (struct file_handle *)c1_buf; + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; + c1_fh->ns_id = c1_id; + c1_fh->ns_type = 0; + c1_fh->ns_inum = 0; + + c2_handle = (struct file_handle *)c2_buf; + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; + c2_fh->ns_id = c2_id; + c2_fh->ns_type = 0; + c2_fh->ns_inum = 0; + + /* Open both children before process exits */ + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); + + if (c1_fd < 0 || c2_fd < 0) { + if (c1_fd >= 0) close(c1_fd); + if (c2_fd >= 0) close(c2_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be 
active (both children active) */ + TH_LOG("Both children active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first child - parent should STILL be active */ + TH_LOG("Closing first child - parent should still be active"); + close(c1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second child - NOW parent should become inactive */ + TH_LOG("Closing second child - parent should become inactive"); + close(c2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that different namespace types with same owner all contribute + * active references to the owning user namespace. + */ +TEST(ns_different_types_same_owner) +{ + struct file_handle *u_handle, *n_handle, *ut_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + /* Create network namespace (owned by user namespace) */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + /* Create UTS namespace (also owned by user namespace) */ + if (unshare(CLONE_NEWUTS) < 0) { + 
close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + u_handle = (struct file_handle *)u_buf; + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); + u_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + n_handle = (struct file_handle *)n_buf; + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); + n_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + ut_handle = (struct file_handle *)ut_buf; + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ut_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; 
+ + /* Open both non-user namespaces before process exits */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); + + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * Both network and UTS namespaces are active. + * User namespace should be active (gets 2 active refs). + */ + TH_LOG("Both net and uts active - user namespace should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close network namespace - user namespace should STILL be active */ + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close UTS namespace - user namespace should become inactive */ + TH_LOG("Closing uts ns - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* + * Test hierarchical propagation with deep namespace hierarchy. + * Create: init_user_ns -> user_A -> user_B -> net_ns + * When net_ns is active, both user_A and user_B should be active. + * This verifies the conditional recursion in __ns_ref_active_put() works. 
+ */
+TEST(ns_deep_hierarchy_propagation)
+{
+	struct file_handle *ua_handle, *ub_handle, *net_handle;
+	int ret, pipefd[2], syncpipe[2];
+	pid_t pid;
+	int status;
+	__u64 ua_id, ub_id, net_id;
+	char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+	char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+	char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+
+	/*
+	 * pipefd carries the namespace IDs from child to parent. syncpipe
+	 * keeps the child alive until the parent has opened net_ns, so that
+	 * open cannot race with the child exiting.
+	 */
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		/* Create user_A -> user_B -> net hierarchy */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (ua_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+			close(ua_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(ua_fd);
+
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (ub_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+			close(ub_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(ub_fd);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		int net_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (net_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+			close(net_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(net_fd);
+
+		/* Send all three namespace IDs */
+		if (write(pipefd[1], &ua_id, sizeof(ua_id)) != (ssize_t)sizeof(ua_id) ||
+		    write(pipefd[1], &ub_id, sizeof(ub_id)) != (ssize_t)sizeof(ub_id) ||
+		    write(pipefd[1], &net_id, sizeof(net_id)) != (ssize_t)sizeof(net_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Stay alive until the parent closes its end of syncpipe (EOF) */
+		ret = read(syncpipe[0], &sync_byte, 1);
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	/* Read all three namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+	if (ret != sizeof(ua_id)) {
+		close(pipefd[0]);
+		/* Release the child before reaping it so waitpid() cannot hang */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read user_A namespace ID");
+	}
+
+	ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+	if (ret != sizeof(ub_id)) {
+		close(pipefd[0]);
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read user_B namespace ID");
+	}
+
+	ret = read(pipefd[0], &net_id, sizeof(net_id));
+	close(pipefd[0]);
+	if (ret != sizeof(net_id)) {
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read network namespace ID");
+	}
+
+	/* Construct file handles from namespace IDs */
+	ua_handle = (struct file_handle *)ua_buf;
+	ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	ua_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+	ua_fh->ns_id = ua_id;
+	ua_fh->ns_type = 0;
+	ua_fh->ns_inum = 0;
+
+	ub_handle = (struct file_handle *)ub_buf;
+	ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	ub_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+	ub_fh->ns_id = ub_id;
+	ub_fh->ns_type = 0;
+	ub_fh->ns_inum = 0;
+
+	net_handle = (struct file_handle *)net_buf;
+	net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	net_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+	net_fh->ns_id = net_id;
+	net_fh->ns_type = 0;
+	net_fh->ns_inum = 0;
+
+	/* Open net_ns before child exits to keep it active */
+	int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+
+	/* Closing the write end delivers EOF to the child's blocking read */
+	close(syncpipe[1]);
+
+	if (net_fd < 0) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open network namespace");
+	}
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* With net_ns active, both user_A and user_B should be active */
+	TH_LOG("Testing user_B active (net_ns active causes propagation)");
+	int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+	ASSERT_GE(ub_fd, 0);
+
+	TH_LOG("Testing user_A active (propagated through user_B)");
+	int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+	ASSERT_GE(ua_fd, 0);
+
+	/* Close net_ns - user_B should stay active (we hold direct ref) */
+	TH_LOG("Closing net_ns, user_B should remain active (direct ref held)");
+	close(net_fd);
+	int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+	ASSERT_GE(ub_fd2, 0);
+	close(ub_fd2);
+
+	/* Close user_B - user_A should stay active (we hold direct ref) */
+	TH_LOG("Closing user_B, user_A should remain active (direct ref held)");
+	close(ub_fd);
+	int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+	ASSERT_GE(ua_fd2, 0);
+	close(ua_fd2);
+
+	/* Close user_A - everything should become inactive */
+	TH_LOG("Closing user_A, all should become inactive");
+	close(ua_fd);
+
+	/* All should now be inactive */
+	ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+	ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test that parent stays active as long as ANY child is active.
+ * Create parent user namespace with two child net namespaces.
+ * Parent should remain active until BOTH children are inactive.
+ */
+TEST(ns_parent_multiple_children_refcount)
+{
+	struct file_handle *parent_handle, *net1_handle, *net2_handle;
+	int ret, pipefd[2], syncpipe[2];
+	pid_t pid;
+	int status;
+	__u64 p_id, n1_id, n2_id;
+	char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ];
+	char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		/* Create parent user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		int p_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (p_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+			close(p_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(p_fd);
+
+		/* Create first network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int n1_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (n1_fd < 0) {
+			close(pipefd[1]);
+			
close(syncpipe[0]); + exit(1); + } + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep n1_fd open so first namespace stays active */ + + /* Create second network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n2_fd = open("/proc/self/ns/net", O_RDONLY); + if (n2_fd < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { + close(n1_fd); + close(n2_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep both n1_fd and n2_fd open */ + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &n1_id, sizeof(n1_id)); + write(pipefd[1], &n2_id, sizeof(n2_id)); + close(pipefd[1]); + + /* Wait for parent to signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + close(pipefd[1]); + close(syncpipe[0]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); + if (ret != sizeof(n1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first network namespace ID"); + } + + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); + close(pipefd[0]); + if (ret != sizeof(n2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)p_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + 
p_fh->ns_inum = 0; + + net1_handle = (struct file_handle *)n1_buf; + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle; + n1_fh->ns_id = n1_id; + n1_fh->ns_type = 0; + n1_fh->ns_inum = 0; + + net2_handle = (struct file_handle *)n2_buf; + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; + n2_fh->ns_id = n2_id; + n2_fh->ns_type = 0; + n2_fh->ns_inum = 0; + + /* Open both net namespaces while child is still alive */ + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); + if (n1_fd < 0 || n2_fd < 0) { + if (n1_fd >= 0) close(n1_fd); + if (n2_fd >= 0) close(n2_fd); + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open net namespaces"); + } + + /* Signal child that we have opened the namespaces */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (has 2 active children) */ + TH_LOG("Both net namespaces active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first net namespace - parent should STILL be active */ + TH_LOG("Closing first net ns - parent should still be active"); + close(n1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second net namespace - parent should become inactive */ + TH_LOG("Closing second net ns - parent should become inactive"); + close(n2_fd); + p_fd = 
open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that user namespace as a child also propagates correctly. + * Create user_A -> user_B, verify when user_B is active that user_A + * is also active. This is different from non-user namespace children. + */ +TEST(ns_userns_child_propagation) +{ + struct file_handle *ua_handle, *ub_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + /* Create user_B (child of user_A) */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + /* Send both namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + close(pipefd[0]); + if (ret != sizeof(ub_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle 
*)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + /* Open user_B before child exits */ + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + if (ub_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open user_B"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With user_B active, user_A should also be active */ + TH_LOG("Testing user_A active when child user_B is active"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close user_B */ + TH_LOG("Closing user_B"); + close(ub_fd); + + /* user_A should remain active (we hold direct ref) */ + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - should become inactive */ + TH_LOG("Closing user_A - should become inactive"); + close(ua_fd); + + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test different namespace types (net, uts, ipc) all contributing + * active references to the same owning user namespace. 
+ */ +TEST(ns_mixed_types_same_owner) +{ + struct file_handle *user_handle, *net_handle, *uts_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + 
close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + user_handle = (struct file_handle *)u_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + net_handle = (struct file_handle *)n_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + uts_handle = (struct file_handle *)ut_buf; + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); + uts_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* User namespace should be active (2 active children) */ + TH_LOG("Both net and uts active - user ns should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close net - user ns should STILL be active (uts still active) */ + TH_LOG("Closing net - user ns should still be active"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); 
+ + /* Close uts - user ns should become inactive */ + TH_LOG("Closing uts - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* Thread test helpers and structures */ +struct thread_ns_info { + __u64 ns_id; + int pipefd; + int syncfd_read; + int syncfd_write; + int exit_code; +}; + +static void *thread_create_namespace(void *arg) +{ + struct thread_ns_info *info = (struct thread_ns_info *)arg; + int ret; + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + info->exit_code = 1; + return NULL; + } + + /* Get namespace ID */ + int fd = open("/proc/thread-self/ns/net", O_RDONLY); + if (fd < 0) { + info->exit_code = 2; + return NULL; + } + + ret = ioctl(fd, NS_GET_ID, &info->ns_id); + close(fd); + if (ret < 0) { + info->exit_code = 3; + return NULL; + } + + /* Send namespace ID to main thread */ + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { + info->exit_code = 4; + return NULL; + } + + /* Wait for signal to exit */ + char sync_byte; + if (read(info->syncfd_read, &sync_byte, 1) != 1) { + info->exit_code = 5; + return NULL; + } + + info->exit_code = 0; + return NULL; +} + +/* + * Test that namespace becomes inactive after thread exits. + * This verifies active reference counting works with threads, not just processes. 
+ */ +TEST(thread_ns_inactive_after_exit) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Namespace should be active while thread is alive */ + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + close(nsfd); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + ASSERT_EQ(pthread_join(thread, NULL), 0); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) + SKIP(return, "Thread failed to create namespace"); + + TH_LOG("Thread exited, namespace should be inactive"); + + /* 
Namespace should now be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a thread holds an fd to it. + * Even after the thread exits, the namespace should remain active as long as + * another thread holds a file descriptor to it. + */ +TEST(thread_ns_fd_keeps_active) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Open namespace while thread is alive */ + TH_LOG("Opening namespace while thread is alive"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + 
ASSERT_GE(nsfd, 0); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) { + close(nsfd); + SKIP(return, "Thread failed to create namespace"); + } + + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); + + /* Namespace should still be active because we hold an fd */ + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd2, 0); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(nsfd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(nsfd2); + + TH_LOG("Closing fd - namespace should become inactive"); + close(nsfd); + + /* Now namespace should be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* Structure for thread data in subprocess */ +struct thread_sleep_data { + int syncfd_read; +}; + +static void *thread_sleep_and_wait(void *arg) +{ + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; + char sync_byte; + + /* Wait for signal to exit - read will unblock when pipe is closed */ + (void)read(data->syncfd_read, &sync_byte, 1); + return NULL; +} + +/* + * Test that namespaces become inactive after subprocess with multiple threads exits. + * Create a subprocess that unshares user and network namespaces, then creates two + * threads that share those namespaces. Verify that after all threads and subprocess + * exit, the namespaces are no longer listed by listns() and cannot be opened by + * open_by_handle_at(). 
+ */ +TEST(thread_subprocess_ns_inactive_after_all_exit) +{ + int pipefd[2]; + int sv[2]; + pid_t pid; + int status; + __u64 user_id, net_id; + struct file_handle *user_handle, *net_handle; + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char sync_byte; + int ret; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + close(sv[0]); + + /* Create user namespace with mappings */ + if (setup_userns() < 0) { + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: setup_userns() succeeded\n"); + + /* Get user namespace ID */ + int user_fd = open("/proc/self/ns/user", O_RDONLY); + if (user_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); + close(user_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(user_fd); + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); + + /* Unshare network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); + + /* Get network namespace ID */ + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); + close(net_fd); + 
close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(net_fd); + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); + + /* Send namespace IDs to parent */ + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); + exit(1); + } + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); + exit(1); + } + close(pipefd[1]); + fprintf(stderr, "Child: sent namespace IDs to parent\n"); + + /* Create two threads that share the namespaces */ + pthread_t thread1, thread2; + struct thread_sleep_data data; + data.syncfd_read = sv[1]; + + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: created thread1\n"); + + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + pthread_cancel(thread1); + exit(1); + } + fprintf(stderr, "Child: created thread2\n"); + + /* Wait for threads to complete - they will unblock when parent writes */ + fprintf(stderr, "Child: waiting for threads to exit\n"); + pthread_join(thread1, NULL); + fprintf(stderr, "Child: thread1 exited\n"); + pthread_join(thread2, NULL); + fprintf(stderr, "Child: thread2 exited\n"); + + close(sv[1]); + + /* Exit - namespaces should become inactive */ + fprintf(stderr, "Child: all threads joined, exiting with success\n"); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + close(sv[1]); + + TH_LOG("Parent: waiting to read namespace IDs from child"); + + /* Read namespace IDs from child */ + ret = read(pipefd[0], &user_id, sizeof(user_id)); + if (ret != sizeof(user_id)) { + 
TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); + close(pipefd[0]); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID from child"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID from child"); + } + + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", + (unsigned long long)user_id, (unsigned long long)net_id); + + /* Construct file handles */ + user_handle = (struct file_handle *)user_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle; + user_fh->ns_id = user_id; + user_fh->ns_type = 0; + user_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Verify namespaces are active while subprocess and threads are alive */ + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(user_fd, 0); + + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_GE(net_fd, 0); + + close(user_fd); + close(net_fd); + + /* Also verify they appear in listns() */ + TH_LOG("Verifying namespaces appear in listns() while active"); + struct ns_id_req req = { + .size = sizeof(struct ns_id_req), + .spare = 0, + .ns_id 
= 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids < 0) { + TH_LOG("listns() not available, skipping listns verification"); + } else { + /* Check if user_id is in the list */ + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_TRUE(found_user); + TH_LOG("User namespace found in listns() as expected"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_TRUE(found_net); + TH_LOG("Network namespace found in listns() as expected"); + } + } + + /* Signal threads to exit */ + TH_LOG("Signaling threads to exit"); + sync_byte = 'X'; + /* Write two bytes - one for each thread */ + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + close(sv[0]); + + /* Wait for child process to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + if (WEXITSTATUS(status) != 0) { + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); + SKIP(return, "Child process failed"); + } + + TH_LOG("Subprocess and all threads have exited successfully"); + + /* Verify namespaces are now inactive - open_by_handle_at should fail */ + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(user_fd, 0); + TH_LOG("User namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_LT(net_fd, 0); + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == 
ENOENT || errno == ESTALE); + + /* Verify namespaces do NOT appear in listns() */ + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); + memset(&req, 0, sizeof(req)); + req.size = sizeof(struct ns_id_req); + req.ns_type = CLONE_NEWUSER; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_FALSE(found_user); + TH_LOG("User namespace correctly not listed in listns()"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_FALSE(found_net); + TH_LOG("Network namespace correctly not listed in listns()"); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..527ade0a8673 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -6,6 +6,7 @@ #include <libgen.h> #include <limits.h> #include <pthread.h> +#include <signal.h> #include <string.h> #include <sys/mount.h> #include <poll.h> @@ -14,12 +15,30 @@ #include <sys/stat.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/nsfs.h> #include "../kselftest_harness.h" +/* Fixture for tests that create child processes */ +FIXTURE(nsid) { + pid_t child_pid; +}; + +FIXTURE_SETUP(nsid) { + self->child_pid = 0; +} + +FIXTURE_TEARDOWN(nsid) { + /* Clean up any child process that may still be running */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + TEST(nsid_mntns_basic) { __u64 mnt_ns_id = 0; @@ -44,7 +63,7 @@ TEST(nsid_mntns_basic) 
close(fd_mntns); } -TEST(nsid_mntns_separate) +TEST_F(nsid, mntns_separate) { __u64 parent_mnt_ns_id = 0; __u64 child_mnt_ns_id = 0; @@ -90,6 +109,9 @@ TEST(nsid_mntns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -99,8 +121,6 @@ TEST(nsid_mntns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_mntns); SKIP(return, "No permission to create mount namespace"); } @@ -123,10 +143,6 @@ TEST(nsid_mntns_separate) close(fd_parent_mntns); close(fd_child_mntns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_cgroupns_basic) @@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic) close(fd_cgroupns); } -TEST(nsid_cgroupns_separate) +TEST_F(nsid, cgroupns_separate) { __u64 parent_cgroup_ns_id = 0; __u64 child_cgroup_ns_id = 0; @@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_cgroupns); SKIP(return, "No permission to create cgroup namespace"); } @@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate) close(fd_parent_cgroupns); close(fd_child_cgroupns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_ipcns_basic) @@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic) close(fd_ipcns); } -TEST(nsid_ipcns_separate) +TEST_F(nsid, ipcns_separate) { __u64 parent_ipc_ns_id = 0; __u64 child_ipc_ns_id = 0; @@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, 
SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_ipcns); SKIP(return, "No permission to create IPC namespace"); } @@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate) close(fd_parent_ipcns); close(fd_child_ipcns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_utsns_basic) @@ -371,7 +381,7 @@ TEST(nsid_utsns_basic) close(fd_utsns); } -TEST(nsid_utsns_separate) +TEST_F(nsid, utsns_separate) { __u64 parent_uts_ns_id = 0; __u64 child_uts_ns_id = 0; @@ -417,6 +427,9 @@ TEST(nsid_utsns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -426,8 +439,6 @@ TEST(nsid_utsns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_utsns); SKIP(return, "No permission to create UTS namespace"); } @@ -450,10 +461,6 @@ TEST(nsid_utsns_separate) close(fd_parent_utsns); close(fd_child_utsns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_userns_basic) @@ -480,7 +487,7 @@ TEST(nsid_userns_basic) close(fd_userns); } -TEST(nsid_userns_separate) +TEST_F(nsid, userns_separate) { __u64 parent_user_ns_id = 0; __u64 child_user_ns_id = 0; @@ -526,6 +533,9 @@ TEST(nsid_userns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -535,8 +545,6 @@ TEST(nsid_userns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_userns); SKIP(return, "No permission to create user namespace"); } @@ -559,10 +567,6 @@ TEST(nsid_userns_separate) close(fd_parent_userns); close(fd_child_userns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_timens_basic) @@ -591,7 +595,7 @@ TEST(nsid_timens_basic) close(fd_timens); } -TEST(nsid_timens_separate) +TEST_F(nsid, timens_separate) { 
__u64 parent_time_ns_id = 0; __u64 child_time_ns_id = 0; @@ -652,6 +656,9 @@ TEST(nsid_timens_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -660,8 +667,6 @@ TEST(nsid_timens_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_timens); close(pipefd[0]); SKIP(return, "Cannot create time namespace"); @@ -689,10 +694,6 @@ TEST(nsid_timens_separate) close(fd_parent_timens); close(fd_child_timens); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_pidns_basic) @@ -719,7 +720,7 @@ TEST(nsid_pidns_basic) close(fd_pidns); } -TEST(nsid_pidns_separate) +TEST_F(nsid, pidns_separate) { __u64 parent_pid_ns_id = 0; __u64 child_pid_ns_id = 0; @@ -776,6 +777,9 @@ TEST(nsid_pidns_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -784,8 +788,6 @@ TEST(nsid_pidns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_pidns); close(pipefd[0]); SKIP(return, "No permission to create PID namespace"); @@ -813,10 +815,6 @@ TEST(nsid_pidns_separate) close(fd_parent_pidns); close(fd_child_pidns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_netns_basic) @@ -860,7 +858,7 @@ TEST(nsid_netns_basic) close(fd_netns); } -TEST(nsid_netns_separate) +TEST_F(nsid, netns_separate) { __u64 parent_net_ns_id = 0; __u64 parent_netns_cookie = 0; @@ -920,6 +918,9 @@ TEST(nsid_netns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -929,8 +930,6 @@ TEST(nsid_netns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_netns); close(parent_sock); SKIP(return, "No 
permission to create network namespace"); @@ -977,10 +976,6 @@ TEST(nsid_netns_separate) close(fd_parent_netns); close(fd_child_netns); close(parent_sock); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. 
+ */
+TEST(simple_pidfd_setns)
+{
+	const int unshare_flags = CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER;
+	int sync_sk[2];
+	int pidfd = -1;
+	pid_t child;
+	char ready;
+	int ret;
+
+	/* Ignoring SIGCHLD means nobody has to wait for the child (autoreap) */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sync_sk), 0);
+
+	/* No namespace flags are passed here; the child unshares on its own */
+	child = create_child(&pidfd, 0);
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		int rc = 1;
+
+		close(sync_sk[0]);
+
+		/* Unshare first, then report readiness; either failure exits with 1 */
+		if (unshare(unshare_flags) >= 0 &&
+		    write_nointr(sync_sk[1], "1", 1) >= 0)
+			rc = 0;
+
+		close(sync_sk[1]);
+		_exit(rc);
+	}
+	ASSERT_GE(pidfd, 0);
+	EXPECT_EQ(close(sync_sk[1]), 0);
+
+	/* Block until the child has written its single readiness byte */
+	ret = read_nointr(sync_sk[0], &ready, 1);
+	ASSERT_EQ(ret, 1);
+	EXPECT_EQ(close(sync_sk[0]), 0);
+
+	/* Join the child's UTS and IPC namespaces via the pidfd; the result is only logged */
+	ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	TH_LOG("setns() returned %d", ret);
+	close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test using create_child().
+ *
+ * This variation uses create_child() with namespace flags directly.
+ * Namespaces are created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+	const int clone_flags = CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET;
+	int pidfd = -1;
+	pid_t child;
+	int rc;
+
+	/* Ignoring SIGCHLD means nobody has to wait for the child (autoreap) */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	/* The namespace flags go straight to create_child() in this variant */
+	child = create_child(&pidfd, clone_flags);
+	ASSERT_GE(child, 0);
+
+	if (!child) {
+		/* Child: stay around for a while so the parent can setns to us */
+		sleep(2);
+		_exit(0);
+	}
+
+	/* Parent: create_child() is expected to have filled in the pidfd */
+	ASSERT_GE(pidfd, 0);
+
+	/* Join the child's UTS and IPC namespaces via the pidfd; the result is only logged */
+	rc = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	close(pidfd);
+	TH_LOG("setns() returned %d", rc);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c
new file mode 100644
index 000000000000..ba689a22d82f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/siocgskns_test.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/if.h>
+#include <linux/sockios.h>
+#include <linux/nsfs.h>
+#include <arpa/inet.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef SIOCGSKNS
+#define SIOCGSKNS 0x894C
+#endif
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test basic SIOCGSKNS functionality.
+ * Create a socket and verify SIOCGSKNS returns the correct network namespace.
+ */
+TEST(siocgskns_basic)
+{
+	int sock_fd, netns_fd, current_netns_fd;
+	struct stat st1, st2;
+
+	/* Create a TCP socket */
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_fd, 0);
+
+	/* Use SIOCGSKNS to get network namespace */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Get current network namespace */
+	current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(current_netns_fd, 0);
+
+	/* Both descriptors must refer to the same nsfs inode */
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	close(sock_fd);
+	close(netns_fd);
+	close(current_netns_fd);
+}
+
+/*
+ * Test that socket file descriptors keep network namespaces active.
+ * Create a network namespace, create a socket in it, then exit the namespace.
+ * The namespace should remain active while the socket FD is held.
+ */
+TEST(siocgskns_keeps_netns_active)
+{
+	int sock_fd, netns_fd, test_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	struct stat st;
+
+	/* Must be fatal: everything below uses both ends of the socketpair */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Create a socket in the new network namespace */
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			TH_LOG("socket() failed: %s", strerror(errno));
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS, with one payload byte */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only meaningful if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+
+	/*
+	 * Namespace should still be active because socket FD keeps it alive.
+	 * Try to access it via /proc/self/fd/<fd>.
+	 */
+	char path[64];
+	snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd);
+	test_fd = open(path, O_RDONLY);
+	ASSERT_GE(test_fd, 0);
+	close(test_fd);
+	close(netns_fd);
+
+	/* Close socket - namespace should become inactive */
+	close(sock_fd);
+
+	/*
+	 * Try SIOCGSKNS again - should fail since socket is closed.
+	 * sock_fd is a stale descriptor number at this point, so this only
+	 * checks that the ioctl is rejected.
+	 */
+	ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0);
+}
+
+/*
+ * Test SIOCGSKNS with different socket types (TCP, UDP, RAW).
+ */
+TEST(siocgskns_socket_types)
+{
+	int sock_tcp, sock_udp, sock_raw;
+	int netns_tcp, netns_udp, netns_raw = -1;
+	struct stat st_tcp, st_udp, st_raw;
+
+	/* TCP socket */
+	sock_tcp = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_tcp, 0);
+
+	/* UDP socket */
+	sock_udp = socket(AF_INET, SOCK_DGRAM, 0);
+	ASSERT_GE(sock_udp, 0);
+
+	/*
+	 * RAW socket (may require privileges). Any failure leaves sock_raw
+	 * negative, and the raw-socket checks below are skipped in that case.
+	 */
+	sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+
+	/* Test SIOCGSKNS on TCP */
+	netns_tcp = ioctl(sock_tcp, SIOCGSKNS);
+	if (netns_tcp < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		close(sock_tcp);
+		close(sock_udp);
+		if (sock_raw >= 0)
+			close(sock_raw);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_tcp, 0);
+	}
+
+	/* Test SIOCGSKNS on UDP */
+	netns_udp = ioctl(sock_udp, SIOCGSKNS);
+	ASSERT_GE(netns_udp, 0);
+
+	/* Test SIOCGSKNS on RAW (if available) */
+	if (sock_raw >= 0) {
+		netns_raw = ioctl(sock_raw, SIOCGSKNS);
+		ASSERT_GE(netns_raw, 0);
+	}
+
+	/* Verify all return the same network namespace */
+	ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0);
+	ASSERT_EQ(fstat(netns_udp, &st_udp), 0);
+	ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino);
+
+	if (sock_raw >= 0) {
+		ASSERT_EQ(fstat(netns_raw, &st_raw), 0);
+		ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino);
+		close(netns_raw);
+		close(sock_raw);
+	}
+
+	close(netns_tcp);
+	close(netns_udp);
+	close(sock_tcp);
+	close(sock_udp);
+}
+
+/*
+ * Test SIOCGSKNS across setns.
+ * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still
+ * returns netns A.
+ */
+TEST(siocgskns_across_setns)
+{
+	int sock_fd, netns_a_fd, netns_b_fd, result_fd;
+	struct stat st_a;
+
+	/* Get current netns (A) */
+	netns_a_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(netns_a_fd, 0);
+	ASSERT_EQ(fstat(netns_a_fd, &st_a), 0);
+
+	/* Create socket in netns A */
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_fd, 0);
+
+	/* Create new netns (B) */
+	ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+	netns_b_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(netns_b_fd, 0);
+
+	/* Get netns from socket created in A */
+	result_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (result_fd < 0) {
+		/*
+		 * Save errno first: setns() and close() below may fail and
+		 * overwrite it before the skip decision is made.
+		 */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		setns(netns_a_fd, CLONE_NEWNET);
+		close(netns_a_fd);
+		close(netns_b_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(result_fd, 0);
+	}
+
+	/* Verify it still points to netns A */
+	struct stat st_result_stat;
+	ASSERT_EQ(fstat(result_fd, &st_result_stat), 0);
+	ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino);
+
+	close(result_fd);
+	close(sock_fd);
+	close(netns_b_fd);
+
+	/* Restore original netns */
+	ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0);
+	close(netns_a_fd);
+}
+
+/*
+ * Test SIOCGSKNS fails on non-socket file descriptors.
+ */
+TEST(siocgskns_non_socket)
+{
+	int fd;
+	int pipefd[2];
+
+	/* Test on a non-socket device node (/dev/null) */
+	fd = open("/dev/null", O_RDONLY);
+	ASSERT_GE(fd, 0);
+
+	ASSERT_LT(ioctl(fd, SIOCGSKNS), 0);
+	ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+	close(fd);
+
+	/* Test on pipe */
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0);
+	ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+
+	close(pipefd[0]);
+	close(pipefd[1]);
+}
+
+/*
+ * Test multiple sockets keep the same network namespace active.
+ * Create multiple sockets, verify closing some doesn't affect others.
+ */
+TEST(siocgskns_multiple_sockets)
+{
+	int socks[5];
+	int netns_fds[5];
+	int i;
+	struct stat st;
+	ino_t netns_ino;
+
+	/* Create new network namespace */
+	ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+	/* Create multiple sockets */
+	for (i = 0; i < 5; i++) {
+		socks[i] = socket(AF_INET, SOCK_STREAM, 0);
+		ASSERT_GE(socks[i], 0);
+	}
+
+	/* Get netns from all sockets */
+	for (i = 0; i < 5; i++) {
+		netns_fds[i] = ioctl(socks[i], SIOCGSKNS);
+		if (netns_fds[i] < 0) {
+			/* Save errno first: the cleanup below must not influence the skip decision */
+			int saved_errno = errno;
+			int j;
+
+			/* All five sockets exist already; release every one of them */
+			for (j = 0; j < 5; j++)
+				close(socks[j]);
+			/* Only the netns fds obtained before this iteration are valid */
+			for (j = 0; j < i; j++)
+				close(netns_fds[j]);
+			if (saved_errno == ENOTTY || saved_errno == EINVAL)
+				SKIP(return, "SIOCGSKNS not supported");
+			ASSERT_GE(netns_fds[i], 0);
+		}
+	}
+
+	/* Verify all point to same netns */
+	ASSERT_EQ(fstat(netns_fds[0], &st), 0);
+	netns_ino = st.st_ino;
+
+	for (i = 1; i < 5; i++) {
+		ASSERT_EQ(fstat(netns_fds[i], &st), 0);
+		ASSERT_EQ(st.st_ino, netns_ino);
+	}
+
+	/* Close some sockets */
+	for (i = 0; i < 3; i++) {
+		close(socks[i]);
+	}
+
+	/* Remaining netns FDs should still be valid */
+	for (i = 3; i < 5; i++) {
+		char path[64];
+		snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]);
+		int test_fd = open(path, O_RDONLY);
+		ASSERT_GE(test_fd, 0);
+		close(test_fd);
+	}
+
+	/* Cleanup: sockets 0-2 were closed above, so only 3-4 remain open */
+	for (i = 0; i < 5; i++) {
+		if (i >= 3)
+			close(socks[i]);
+		close(netns_fds[i]);
+	}
+}
+
+/*
+ * Test socket keeps 
netns active after creating process exits.
+ * Verify that as long as the socket FD exists, the namespace remains active.
+ */
+TEST(siocgskns_netns_lifecycle)
+{
+	int sock_fd, netns_fd;
+	int ipc_sockets[2];
+	int syncpipe[2];
+	pid_t pid;
+	int status;
+	char sync_byte;
+	struct stat st;
+	ino_t netns_ino;
+
+	/* Must be fatal: everything below uses both ends of the socketpair */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child */
+		close(ipc_sockets[0]);
+		close(syncpipe[1]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		/* Send socket to parent via SCM_RIGHTS, with one payload byte */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+
+		/*
+		 * Wait for parent signal. A short read means the parent went
+		 * away without signalling; report that through the exit code.
+		 */
+		if (read(syncpipe[0], &sync_byte, 1) != 1) {
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(ipc_sockets[1]);
+	close(syncpipe[0]);
+
+	/* Receive socket FD */
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Get netns from socket while child is alive */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/*
+		 * Save errno first: write(), close() and waitpid() below may
+		 * overwrite it before the skip decision is made.
+		 */
+		int saved_errno = errno;
+
+		sync_byte = 'G';
+		/* Best effort: closing the pipe below unblocks the child anyway */
+		if (write(syncpipe[1], &sync_byte, 1) != 1)
+			TH_LOG("failed to signal child: %s", strerror(errno));
+		close(syncpipe[1]);
+		close(sock_fd);
+		waitpid(pid, NULL, 0);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+	netns_ino = st.st_ino;
+
+	/* Signal child to exit */
+	sync_byte = 'G';
+	ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+	close(syncpipe[1]);
+
+	/* status is only meaningful if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	/*
+	 * Socket FD should still keep namespace active even after
+	 * the creating process exited.
+	 */
+	int test_fd = ioctl(sock_fd, SIOCGSKNS);
+	ASSERT_GE(test_fd, 0);
+
+	struct stat st_test;
+	ASSERT_EQ(fstat(test_fd, &st_test), 0);
+	ASSERT_EQ(st_test.st_ino, netns_ino);
+
+	close(test_fd);
+	close(netns_fd);
+
+	/* Close socket - namespace should become inactive */
+	close(sock_fd);
+}
+
+/*
+ * Test IPv6 sockets also work with SIOCGSKNS.
+ */
+TEST(siocgskns_ipv6)
+{
+	int sock_fd, netns_fd, current_netns_fd;
+	struct stat st1, st2;
+
+	/* Create an IPv6 TCP socket; skip rather than fail when IPv6 is unavailable */
+	sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sock_fd < 0 && errno == EAFNOSUPPORT)
+		SKIP(return, "IPv6 not supported");
+	ASSERT_GE(sock_fd, 0);
+
+	/* Use SIOCGSKNS */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Verify it matches current namespace */
+	current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(current_netns_fd, 0);
+
+	/* Both descriptors must refer to the same nsfs inode */
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	close(sock_fd);
+	close(netns_fd);
+	close(current_netns_fd);
+}
+
+/*
+ * Test that socket-kept netns appears in listns() output.
+ * Verify that a network namespace kept alive by a socket FD appears in
+ * listns() output even after the creating process exits, and that it
+ * disappears when the socket is closed.
+ */
+TEST(siocgskns_listns_visibility)
+{
+	int sock_fd, netns_fd, owner_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	__u64 netns_id, owner_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int ret, i;
+	bool found_netns = false;
+
+	/* Must be fatal: everything below uses both ends of the socketpair */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS, with one payload byte */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only meaningful if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		int saved_errno = errno;
+
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Get owner user namespace */
+	owner_fd = ioctl(netns_fd, NS_GET_USERNS);
+	if (owner_fd < 0) {
+		int saved_errno = errno;
+
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_USERNS not supported");
+		ASSERT_GE(owner_fd, 0);
+	}
+
+	/* Get owner namespace ID */
+	ret = ioctl(owner_fd, NS_GET_ID, &owner_id);
+	if (ret < 0) {
+		close(owner_fd);
+		close(sock_fd);
+		close(netns_fd);
+		ASSERT_EQ(ret, 0);
+	}
+	close(owner_fd);
+
+	/* Namespace should appear in listns() output */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		/* Save errno first: it is both tested and logged after the cleanup */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(saved_errno));
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Search for our network namespace in the list */
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_TRUE(found_netns);
+	/* __u64 is not unsigned long long on every architecture; cast for %llu */
+	TH_LOG("Found netns %llu in listns() output (kept alive by socket)",
+	       (unsigned long long)netns_id);
+
+	/* Now verify with owner filtering */
+	req.user_ns_id = owner_id;
+	found_netns = false;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_TRUE(found_netns);
+	TH_LOG("Found netns %llu owned by userns %llu",
+	       (unsigned long long)netns_id, (unsigned long long)owner_id);
+
+	/* Close socket - namespace should become inactive and disappear from listns() */
+	close(sock_fd);
+	close(netns_fd);
+
+	/* Verify it's no longer in listns() output */
+	req.user_ns_id = 0;
+	found_netns = false;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found_netns);
+	TH_LOG("Netns %llu correctly disappeared from listns() after socket closed",
+	       (unsigned long long)netns_id);
+}
+
+/*
+ * Test that socket-kept netns can be reopened via file handle.
+ * Verify that a network namespace kept alive by a socket FD can be
+ * reopened using file handles even after the creating process exits.
+ */
+TEST(siocgskns_file_handle)
+{
+	int sock_fd, netns_fd, reopened_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	struct stat st1, st2;
+	ino_t netns_ino;
+	__u64 netns_id;
+	struct file_handle *handle;
+	struct nsfs_file_handle *nsfs_fh;
+	int ret;
+
+	/* Allocate file_handle structure for nsfs */
+	handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+	ASSERT_NE(handle, NULL);
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+
+	/* Must be fatal: everything below uses both ends of the socketpair */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS, with one payload byte */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only meaningful if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Save errno first: the cleanup below must not influence the skip decision */
+		int saved_errno = errno;
+
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	netns_ino = st1.st_ino;
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		int saved_errno = errno;
+
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Construct file handle from namespace ID */
+	nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+	nsfs_fh->ns_id = netns_id;
+	nsfs_fh->ns_type = 0; /* Type field not needed for reopening */
+	nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */
+
+	/* ino_t and __u64 widths vary by architecture; cast to match the formats */
+	TH_LOG("Constructed file handle for netns %lu (id=%llu)",
+	       (unsigned long)netns_ino, (unsigned long long)netns_id);
+
+	/* Reopen namespace using file handle (netns_fd is still open here) */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		/* Save errno first: it is both tested and logged after the cleanup */
+		int saved_errno = errno;
+
+		free(handle);
+		close(netns_fd);
+		close(sock_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	/* Verify it's the same namespace */
+	ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+	TH_LOG("Successfully reopened netns %lu via file handle", (unsigned long)netns_ino);
+
+	close(reopened_fd);
+
+	/* Close the netns FD */
+	close(netns_fd);
+
+	/*
+	 * Try to reopen via file handle - the test expects this to fail now
+	 * that no nsfs FD is open, even though sock_fd has not been closed.
+	 */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(reopened_fd, 0);
+	TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+	/* Get network namespace from socket again */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		int saved_errno = errno;
+
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* With a fresh nsfs FD from the socket, reopening by handle must work again */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		int saved_errno = errno;
+
+		free(handle);
+		close(netns_fd);
+		close(sock_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	/* Verify it's the same namespace */
+	ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+	TH_LOG("Successfully reopened netns %lu via file handle", (unsigned long)netns_ino);
+
+	/* Release the descriptors obtained in the second round */
+	close(reopened_fd);
+	close(netns_fd);
+
+	/* Close socket - namespace should become inactive */
+	close(sock_fd);
+	free(handle);
+}
+
+/*
+ 
* Test combined listns() and file handle operations with socket-kept netns. + * Create a netns, keep it alive with a socket, verify it appears in listns(), + * then reopen it via file handle obtained from listns() entry. + */ +TEST(siocgskns_listns_and_file_handle) +{ + int sock_fd, netns_fd, userns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + ino_t netns_ino; + __u64 netns_id, userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false, found_userns = false; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new userns and netns with socket */ + close(ipc_sockets[0]); + + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, 
sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (userns_fd < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); + if (ret < 0) { + close(userns_fd); + free(handle); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(userns_fd); + + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", 
netns_ino, netns_id, userns_id); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + struct stat reopened_st; + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); + ASSERT_EQ(reopened_st.st_ino, netns_ino); + + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); + + close(reopened_fd); + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify namespace appears in listns() */ 
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + close(netns_fd); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_FALSE(found_netns); + ASSERT_FALSE(found_userns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); + + close(sock_fd); + free(handle); +} + +/* + * Test multi-level namespace resurrection across three user namespace levels. + * + * This test creates a complex namespace hierarchy with three levels of user + * namespaces and a network namespace at the deepest level. It verifies that + * the resurrection semantics work correctly when SIOCGSKNS is called on a + * socket from an inactive namespace tree, and that listns() and + * open_by_handle_at() correctly respect visibility rules. 
+ * + * Hierarchy after child processes exit (all with 0 active refcount): + * + * net_L3A (0) <- Level 3 network namespace + * | + * + + * userns_L3 (0) <- Level 3 user namespace + * | + * + + * userns_L2 (0) <- Level 2 user namespace + * | + * + + * userns_L1 (0) <- Level 1 user namespace + * | + * x + * init_user_ns + * + * The test verifies: + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain + * 2. After resurrection, all namespaces are visible in listns() + * 3. Resurrected namespaces can be reopened via file handles + * 4. Closing the netns FD cascades down: the entire ownership chain + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again + * 5. Inactive namespaces disappear from listns() and cannot be reopened + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again + * 7. After second resurrection, namespaces are visible and can be reopened + */ +TEST(siocgskns_multilevel_resurrection) +{ + int ipc_sockets[2]; + pid_t pid_l1, pid_l2, pid_l3; + int status; + + /* Namespace file descriptors to be received from child */ + int sock_L3A_fd = -1; + int netns_L3A_fd = -1; + __u64 netns_L3A_id; + __u64 userns_L1_id, userns_L2_id, userns_L3_id; + + /* For listns() and file handle testing */ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int reopened_fd; + + /* Allocate file handle for testing */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + /* + * Fork level 1 child that creates userns_L1 + */ + pid_l1 = fork(); + ASSERT_GE(pid_l1, 0); + + if (pid_l1 == 0) { 
+ /* Level 1 child */ + int ipc_L2[2]; + close(ipc_sockets[0]); + + /* Create userns_L1 */ + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Create socketpair for communicating with L2 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* + * Fork level 2 child that creates userns_L2 + */ + pid_l2 = fork(); + if (pid_l2 < 0) { + close(ipc_sockets[1]); + close(ipc_L2[0]); + close(ipc_L2[1]); + exit(1); + } + + if (pid_l2 == 0) { + /* Level 2 child */ + int ipc_L3[2]; + close(ipc_L2[0]); + + /* Create userns_L2 (nested inside userns_L1) */ + if (setup_userns() < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* Create socketpair for communicating with L3 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* + * Fork level 3 child that creates userns_L3 and network namespaces + */ + pid_l3 = fork(); + if (pid_l3 < 0) { + close(ipc_L2[1]); + close(ipc_L3[0]); + close(ipc_L3[1]); + exit(1); + } + + if (pid_l3 == 0) { + /* Level 3 child - the deepest level */ + int sock_fd; + close(ipc_L3[0]); + + /* Create userns_L3 (nested inside userns_L2) */ + if (setup_userns() < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create network namespace at level 3 */ + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create socket in net_L3A */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Send socket FD to L2 parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = 
CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_L3[1]); + exit(1); + } + + close(sock_fd); + close(ipc_L3[1]); + exit(0); + } + + /* Level 2 child - receive from L3 and forward to L1 */ + close(ipc_L3[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); + close(ipc_L3[0]); + + if (n != 1) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L3 child */ + waitpid(pid_l3, NULL, 0); + + /* Forward the socket FD to L1 parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Y'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_L2[1]); + exit(1); + } + + close(received_fd); + close(ipc_L2[1]); + exit(0); + } + + /* Level 1 child - receive from L2 and forward to parent */ + close(ipc_L2[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); + 
close(ipc_L2[0]); + + if (n != 1) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L2 child */ + waitpid(pid_l2, NULL, 0); + + /* Forward the socket FD to parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Z'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(received_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent - receive the socket from the deepest level */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + + if (n != 1) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L1 child */ + waitpid(pid_l1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * At this point, all child processes have exited. 
The socket itself + * doesn't keep the namespace active - we need to call SIOCGSKNS which + * will resurrect the entire namespace tree by taking active references. + */ + + /* Get network namespace from socket - this resurrects the tree */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + if (netns_L3A_fd < 0) { + free(handle); + close(sock_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_L3A_fd, 0); + } + + /* Get namespace ID for net_L3A */ + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); + if (userns_L3_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_L3_fd, 0); + } + + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); + ASSERT_EQ(ret, 0); + + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); + ASSERT_GE(userns_L2_fd, 0); + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); + ASSERT_EQ(ret, 0); + + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); + ASSERT_GE(userns_L1_fd, 0); + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); + ASSERT_EQ(ret, 0); + + close(userns_L1_fd); + close(userns_L2_fd); + close(userns_L3_fd); + + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); + + /* + * Test 1: Verify net_L3A is visible in listns() after resurrection. + * The entire ownership chain should be resurrected and visible. 
+ */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + bool found_netns_L3A = false; + bool found_userns_L1 = false; + bool found_userns_L2 = false; + bool found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); + + /* + * Test 2: Verify net_L3A can be reopened via file handle. + */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_L3A_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened"); + + /* + * Test 3: Verify that when we close the netns FD (dropping the last + * active reference), the entire tree becomes inactive and disappears + * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops -> + * userns_L2 drops -> userns_L1 drops. 
+ */ + close(netns_L3A_fd); + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_FALSE(found_netns_L3A); + ASSERT_FALSE(found_userns_L1); + ASSERT_FALSE(found_userns_L2); + ASSERT_FALSE(found_userns_L3); + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); + + /* + * Test 4: Verify file handle no longer works for inactive namespace. + */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd >= 0) { + close(reopened_fd); + free(handle); + ASSERT_TRUE(false); /* Should have failed */ + } + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); + + /* + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect + * the namespace tree once more. 
+ */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + ASSERT_GE(netns_L3A_fd, 0); + + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); + + /* Verify the namespace tree is resurrected and visible in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); + + /* Verify we can reopen via file handle again */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); + + /* Final cleanup */ + close(sock_L3A_fd); + close(netns_L3A_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c new file mode 100644 index 000000000000..dd7df7d6cb27 --- /dev/null +++ b/tools/testing/selftests/namespaces/stress_test.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include 
<sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Stress tests for namespace active reference counting. + * + * These tests validate that the active reference counting system can handle + * high load scenarios including rapid namespace creation/destruction, large + * numbers of concurrent namespaces, and various edge cases under stress. + */ + +/* + * Test rapid creation and destruction of user namespaces. + * Create and destroy namespaces in quick succession to stress the + * active reference tracking and ensure no leaks occur. + */ +TEST(rapid_namespace_creation_destruction) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[256], ns_ids_after[256]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count of active user namespaces */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Rapidly create and destroy 100 user namespaces */ + for (i = 0; i < 100; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create user namespace and immediately exit */ + if (setup_userns() < 0) + exit(1); + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline (no leaked namespaces) */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, 
ret_after); +} + +/* + * Test creating many concurrent namespaces. + * Verify that listns() correctly tracks all of them and that they all + * become inactive after processes exit. + */ +TEST(many_concurrent_namespaces) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; + ssize_t ret_before, ret_during, ret_after; + pid_t pids[50]; + int num_children = 50; + int i; + int sv[2]; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children, each with their own user namespace */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Child: create user namespace and wait for parent signal */ + char c; + + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* List namespaces while all children are running */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + 
ASSERT_GE(ret_during, 0); + + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); + + /* Should have at least num_children more namespaces than baseline */ + ASSERT_GE(ret_during, ret_before + num_children); + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + /* If we fail to write, kill remaining children */ + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After all children exit: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test rapid namespace creation with different namespace types. + * Create multiple types of namespaces rapidly to stress the tracking system. 
+ */ +TEST(rapid_mixed_namespace_creation) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces (all types)", ret_before); + + /* Rapidly create and destroy namespaces with multiple types */ + for (i = 0; i < 50; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + + /* Create additional namespace types */ + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + if (unshare(CLONE_NEWIPC) < 0) + exit(1); + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test nested namespace creation under stress. + * Create deeply nested namespace hierarchies and verify proper cleanup. 
+ */ +TEST(nested_namespace_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Create 20 processes, each with nested user namespaces */ + for (i = 0; i < 20; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int userns_fd; + uid_t orig_uid = getuid(); + int depth; + + /* Create nested user namespaces (up to 5 levels) */ + for (depth = 0; depth < 5; depth++) { + userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1); + if (userns_fd < 0) + exit(1); + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + exit(1); + } + close(userns_fd); + } + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test listns() pagination under stress. + * Create many namespaces and verify pagination works correctly. 
+ */ +TEST(listns_pagination_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[30]; + int num_children = 30; + int i; + int sv[2]; + __u64 all_ns_ids[512]; + int total_found = 0; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* Paginate through all namespaces using small batch sizes */ + req.ns_id = 0; + while (1) { + __u64 batch[5]; /* Small batch size to force pagination */ + ssize_t ret; + + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + if (ret == 0) + break; + + /* Store results */ + for (i = 0; i < ret && total_found < 512; i++) { + all_ns_ids[total_found++] = batch[i]; + } + + /* Update cursor for next batch */ + if (ret == ARRAY_SIZE(batch)) + req.ns_id = batch[ret - 1]; + else + break; + } + + 
TH_LOG("Paginated through %d user namespaces", total_found); + + /* Verify no duplicates in pagination */ + for (i = 0; i < total_found; i++) { + for (int j = i + 1; j < total_found; j++) { + if (all_ns_ids[i] == all_ns_ids[j]) { + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", + (unsigned long long)all_ns_ids[i], i, j); + ASSERT_TRUE(false); + } + } + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +/* + * Test concurrent namespace operations. + * Multiple processes creating, querying, and destroying namespaces concurrently. + */ +TEST(concurrent_namespace_operations) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + pid_t pids[20]; + int num_workers = 20; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Create worker processes that do concurrent operations */ + for (i = 0; i < num_workers; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Each worker: create namespaces, list them, repeat */ + int iterations; + + for (iterations = 0; iterations < 10; iterations++) { + int userns_fd; + __u64 temp_ns_ids[100]; + ssize_t ret; + + /* Create a user namespace */ + userns_fd = get_userns_fd(0, getuid(), 1); + if 
(userns_fd < 0) + continue; + + /* List namespaces */ + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); + (void)ret; + + close(userns_fd); + + /* Small delay */ + usleep(1000); + } + + exit(0); + } + } + + /* Wait for all workers */ + for (i = 0; i < num_workers; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test namespace churn - continuous creation and destruction. + * Simulates high-churn scenarios like container orchestration. + */ +TEST(namespace_churn) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int cycle; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Simulate churn: batches of namespaces created and destroyed */ + for (cycle = 0; cycle < 10; cycle++) { + pid_t batch_pids[10]; + int i; + + /* Create batch */ + for (i = 0; i < 10; i++) { + batch_pids[i] = fork(); + ASSERT_GE(batch_pids[i], 0); + + if (batch_pids[i] == 0) { + /* Create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + + /* Keep namespaces alive briefly */ + usleep(10000); + exit(0); + } + } + + /* Wait for batch to complete */ + for (i = 0; i < 10; i++) { + int status; + 
waitpid(batch_pids[i], &status, 0); + } + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h new file mode 100644 index 000000000000..9741a64a5b1d --- /dev/null +++ b/tools/testing/selftests/namespaces/wrappers.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nsfs.h> +#include <linux/types.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__ +#define __SELFTESTS_NAMESPACES_WRAPPERS_H__ + +#ifndef __NR_listns + #if defined __alpha__ + #define __NR_listns 580 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_listns 4470 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_listns 6470 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_listns 5470 + #endif + #else + #define __NR_listns 470 + #endif +#endif + +static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids, + size_t nr_ns_ids, unsigned int flags) +{ + return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags); +} + +#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 439101b518ee..8f9850a71f54 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -45,6 +45,7 @@ skf_net_off socket so_incoming_cpu so_netns_cookie +so_peek_off so_txtime so_rcv_listener stress_reuseport_listen diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index de805cbbdf69..528d14c598bb 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ 
b/tools/testing/selftests/net/af_unix/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := \ scm_inq \ scm_pidfd \ scm_rights \ + so_peek_off \ unix_connect \ # end of TEST_GEN_PROGS diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c new file mode 100644 index 000000000000..1a77728128e5 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/so_peek_off.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include <stdlib.h> +#include <unistd.h> + +#include <sys/socket.h> + +#include "../../kselftest_harness.h" + +FIXTURE(so_peek_off) +{ + int fd[2]; /* 0: sender, 1: receiver */ +}; + +FIXTURE_VARIANT(so_peek_off) +{ + int type; +}; + +FIXTURE_VARIANT_ADD(so_peek_off, stream) +{ + .type = SOCK_STREAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, dgram) +{ + .type = SOCK_DGRAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, seqpacket) +{ + .type = SOCK_SEQPACKET, +}; + +FIXTURE_SETUP(so_peek_off) +{ + struct timeval timeout = { + .tv_sec = 0, + .tv_usec = 3000, + }; + int ret; + + ret = socketpair(AF_UNIX, variant->type, 0, self->fd); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO_NEW, + &timeout, sizeof(timeout)); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF, + &(int){0}, sizeof(int)); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(so_peek_off) +{ + close_range(self->fd[0], self->fd[1], 0); +} + +#define sendeq(fd, str, flags) \ + do { \ + int bytes, len = strlen(str); \ + \ + bytes = send(fd, str, len, flags); \ + ASSERT_EQ(len, bytes); \ + } while (0) + +#define recveq(fd, str, buflen, flags) \ + do { \ + char buf[(buflen) + 1] = {}; \ + int bytes; \ + \ + bytes = recv(fd, buf, buflen, flags); \ + ASSERT_NE(-1, bytes); \ + ASSERT_STREQ(str, buf); \ + } while (0) + +#define async \ + for (pid_t pid = (pid = fork(), \ + pid < 0 ? 
\ + __TH_LOG("Failed to start async {}"), \ + _metadata->exit_code = KSFT_FAIL, \ + __bail(1, _metadata), \ + 0xdead : \ + pid); \ + !pid; exit(0)) + +TEST_F(so_peek_off, single_chunk) +{ + sendeq(self->fd[0], "aaaabbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks) +{ + sendeq(self->fd[0], "aaaa", 0); + sendeq(self->fd[0], "bbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* goto again; -> goto redo; in unix_stream_read_generic(). */ + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_overlap) +{ + sendeq(self->fd[0], "aaaa", 0); + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + sendeq(self->fd[0], "bbbb", 0); + + if (variant->type == SOCK_STREAM) { + /* SOCK_STREAM tries to fill the buffer. */ + recveq(self->fd[1], "aabb", 4, MSG_PEEK); + recveq(self->fd[1], "bb", 100, MSG_PEEK); + } else { + /* SOCK_DGRAM and SOCK_SEQPACKET returns at the skb boundary. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); + } +} + +TEST_F(so_peek_off, two_chunks_overlap_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* Even SOCK_STREAM does not wait if at least one byte is read. 
*/ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh index ff2accccaf4d..b4eda6c6199e 100755 --- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -30,6 +30,11 @@ tfail() do_test "tfail" false } +tfail2() +{ + do_test "tfail2" false +} + txfail() { FAIL_TO_XFAIL=yes do_test "txfail" false @@ -132,6 +137,8 @@ test_ret() ret_subtest $ksft_fail "tfail" txfail tfail ret_subtest $ksft_xfail "txfail" txfail txfail + + ret_subtest $ksft_fail "tfail2" tfail2 tfail } exit_status_tests_run() diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index ecd34f364125..892895659c7e 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -176,6 +176,8 @@ run_test() local rcv_dmac=$(mac_get $rcv_if_name) local should_receive + setup_wait + tcpdump_start $rcv_if_name mc_route_prepare $send_if_name diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index feba4ef69a54..f448bafb3f20 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -43,7 +43,7 @@ __ksft_status_merge() weights[$i]=$((weight++)) done - if [[ ${weights[$a]} > ${weights[$b]} ]]; then + if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then echo "$a" return 0 else diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index b148cadb96d0..fc7e22b503d3 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bw = do_rnd_write(peerfd, 
winfo->buf + winfo->off, winfo->len); if (bw < 0) { - if (cfg_rcv_trunc) - return 0; + /* expected reset, continue to read */ + if (cfg_rcv_trunc && + (errno == ECONNRESET || + errno == EPIPE)) { + fds.events &= ~POLLOUT; + continue; + } + perror("write"); return 111; } @@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, } if (fds.revents & (POLLERR | POLLNVAL)) { - if (cfg_rcv_trunc) - return 0; + if (cfg_rcv_trunc) { + fds.events &= ~(POLLERR | POLLNVAL); + continue; + } fprintf(stderr, "Unexpected revents: " "POLLERR/POLLNVAL(%x)\n", fds.revents); return 5; @@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv) */ if (cfg_truncate < 0) { cfg_rcv_trunc = true; - signal(SIGPIPE, handle_signal); + signal(SIGPIPE, SIG_IGN); } break; case 'j': diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 47ecb5b3836e..9b7b93f8eb0c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -492,7 +492,7 @@ do_transfer() "than expected (${expect_synrx})" retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then + if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then if [ ${stat_ooo_now} -eq 0 ]; then mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \ "than expected (${expect_ackrx})" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 78a1aa4ecff2..43f31f8d587f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2532,7 +2532,7 @@ remove_tests() if reset "remove single subflow"; then pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 @@ -2545,8 +2545,8 @@ 
remove_tests() if reset "remove multiple subflows"; then pm_nl_set_limits $ns1 0 2 pm_nl_set_limits $ns2 0 2 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-2 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2557,7 +2557,7 @@ remove_tests() # single address, remove if reset "remove single address"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 1 addr_nr_ns1=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2570,9 +2570,9 @@ remove_tests() # subflow and signal, remove if reset "remove subflow and signal"; then pm_nl_set_limits $ns1 0 2 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 2 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2584,10 +2584,10 @@ remove_tests() # subflows and signal, remove if reset "remove subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2599,9 +2599,9 @@ remove_tests() # addresses remove if reset "remove addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal 
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2614,10 +2614,10 @@ remove_tests() # invalid addresses remove if reset "remove invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup # broadcast IP: no packet for this address will be received on ns1 - pm_nl_add_endpoint $ns1 224.0.0.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal + pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2631,10 +2631,10 @@ remove_tests() # subflows and signal, flush if reset "flush subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2647,9 +2647,9 @@ remove_tests() if reset "flush subflows"; then pm_nl_set_limits $ns1 3 3 pm_nl_set_limits $ns2 3 3 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150 + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2666,9 +2666,9 @@ remove_tests() # 
addresses flush if reset "flush addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2681,9 +2681,9 @@ remove_tests() # invalid addresses flush if reset "flush invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.14.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -3500,7 +3500,6 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3509,7 +3508,6 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 join_rst_nr=1 \ @@ -3806,7 +3804,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns1 @@ -3831,7 +3829,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create destroy subflow @@ -3839,7 +3837,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3859,7 +3857,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create id 0 subflow @@ -3867,7 +3865,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3880,7 +3878,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm remove initial subflow @@ -3888,7 +3886,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns2 @@ -3904,7 +3902,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm send RM_ADDR for ID 0 @@ -3912,7 +3910,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3930,7 +3928,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi } @@ -3943,7 +3941,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - { speed=slow \ + { timeout_test=120 test_linkfail=128 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -3960,7 +3958,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && @@ -3970,7 +3968,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
@@ -4015,7 +4013,7 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 done - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4048,7 +4046,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4057,39 +4055,46 @@ endpoint_tests() $ns1 10.0.2.1 id 1 flags signal chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 + chk_mptcp_info add_addr_signal 0 add_addr_accepted 0 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 pm_nl_del_endpoint $ns1 99 10.0.1.1 sleep 0.5 chk_subflow_nr "after re-delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal wait_mpj $ns2 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 - mptcp_lib_kill_wait $tests_pid + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 + 
mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4121,7 +4126,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - { test_linkfail=4 speed=20 \ + { timeout_test=120 test_linkfail=128 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4137,7 +4142,7 @@ endpoint_tests() wait_mpj $ns2 pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid join_syn_tx=3 join_connect_err=1 \ chk_join_nr 2 2 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d62e653d48b0..f4388900016a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -350,6 +350,27 @@ mptcp_lib_kill_wait() { wait "${1}" 2>/dev/null } +# $1: PID +mptcp_lib_pid_list_children() { + local curr="${1}" + # evoke 'ps' only once + local pids="${2:-"$(ps o pid,ppid)"}" + + echo "${curr}" + + local pid + for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do + mptcp_lib_pid_list_children "${pid}" "${pids}" + done +} + +# $1: PID +mptcp_lib_kill_group_wait() { + # Some users might not have procps-ng: cannot use "kill -- -PID" + mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null + wait "${1}" 2>/dev/null +} + # $1: IP address mptcp_lib_is_v6() { [ -z "${1##*:*}" ] diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 330e000baeb1..9416ae952e18 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image IMAGE_s390x = 
arch/s390/boot/bzImage -IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE_sparc32 = arch/sparc/boot/image IMAGE_sparc64 = arch/sparc/boot/image @@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig DEFCONFIG_s390x = defconfig -DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG_sparc32 = sparc32_defconfig DEFCONFIG_sparc64 = sparc64_defconfig @@ -156,7 +154,6 @@ QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 QEMU_ARCH_riscv64 = riscv64 QEMU_ARCH_s390x = s390x -QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH_sparc32 = sparc QEMU_ARCH_sparc64 = sparc64 @@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -223,7 +219,6 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390x = -m64 -CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2 diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index e8af1fb505cf..210abe715ed9 100755 
--- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -23,7 +23,7 @@ all_archs=( mips32le mips32be mipsn32le mipsn32be mips64le mips64be ppc ppc64 ppc64le riscv32 riscv64 - s390x s390 + s390x loongarch sparc32 sparc64 m68k @@ -185,10 +185,6 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then - echo "Unsupported configuration" - return - fi if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then echo "Unsupported configuration" return diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index f87993def738..d60f10a873bb 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -148,6 +148,14 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) #endif +#ifndef PIDFD_INFO_SUPPORTED_MASK +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) +#endif + +#ifndef PIDFD_INFO_COREDUMP_SIGNAL +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) +#endif + #ifndef PIDFD_COREDUMPED #define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */ #endif @@ -183,8 +191,11 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct { + __u32 coredump_mask; + __u32 coredump_signal; + }; + __u64 supported_mask; }; /* diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index a0eb6e81eaa2..cb5430a2fd75 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread) EXPECT_EQ(close(pidfd_thread), 0); } +/* + * Test: PIDFD_INFO_SUPPORTED_MASK field + * + * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel + * returns the supported_mask field indicating which flags the kernel supports. 
+ */ +TEST(supported_mask_field) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + /* Request supported_mask field */ + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + + /* Verify supported_mask contains expected flags */ + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + +/* + * Test: PIDFD_INFO_SUPPORTED_MASK always available + * + * Verify that supported_mask is returned even when other fields are requested. 
+ */ +TEST(supported_mask_with_other_fields) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Both fields should be present */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_NE(info.supported_mask, 0); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h index 33baaa9f9997..e7b858cd3736 100644 --- a/tools/testing/selftests/rseq/rseq-s390.h +++ b/tools/testing/selftests/rseq/rseq-s390.h @@ -28,8 +28,6 @@ do { \ RSEQ_WRITE_ONCE(*(p), v); \ } while (0) -#ifdef __s390x__ - #define LONG_L "lg" #define LONG_S "stg" #define LONG_LT_R "ltgr" @@ -63,43 +61,6 @@ do { \ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \ ".popsection\n\t" -#elif __s390__ - -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \ - start_ip, post_commit_offset, abort_ip) \ - ".pushsection __rseq_cs, \"aw\"\n\t" \ - ".balign 32\n\t" \ - __rseq_str(label) ":\n\t" \ - ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \ - ".popsection\n\t" \ - ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(label) "b\n\t" \ - ".popsection\n\t" - -/* - * Exit points of a rseq critical section consist of all instructions outside - * of the critical section where a critical section can either branch to or - * reach through the normal course of its execution. 
The abort IP and the - * post-commit IP are already part of the __rseq_cs section and should not be - * explicitly defined as additional exit points. Knowing all exit points is - * useful to assist debuggers stepping over the critical section. - */ -#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \ - ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \ - ".popsection\n\t" - -#define LONG_L "l" -#define LONG_S "st" -#define LONG_LT_R "ltr" -#define LONG_CMP "c" -#define LONG_CMP_R "cr" -#define LONG_ADDI "ahi" -#define LONG_ADD_R "ar" - -#endif - #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \ (post_commit_ip - start_ip), abort_ip) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 998e5a2f4579..0091bcd91c2c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -961,5 +961,49 @@ "teardown": [ "$TC qdisc del dev $DUMMY root" ] + }, + { + "id": "4989", + "name": "Try to add an fq child to an ingress qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 ingress" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY ingress" + ] + }, + { + "id": "c2b0", + "name": "Try to add an fq child to a clsact qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 clsact" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + 
"expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] } ] diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..10badae13ebe 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns) return 0; } +static void dummy_event_handler(int val) +{ + /* No action needed */ +} + +static int nanosleep_test_remaining(int clockid) +{ + struct timespec rqtp = {}, rmtp = {}; + struct itimerspec itimer = {}; + struct sigaction sa = {}; + timer_t timer; + int ret; + + sa.sa_handler = dummy_event_handler; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + ret = timer_create(clockid, NULL, &timer); + if (ret) + return -1; + + itimer.it_value.tv_nsec = NSEC_PER_SEC / 4; + ret = timer_settime(timer, 0, &itimer, NULL); + if (ret) + return -1; + + rqtp.tv_nsec = NSEC_PER_SEC / 2; + ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp); + if (ret != EINTR) + return -1; + + ret = timer_delete(timer); + if (ret) + return -1; + + sa.sa_handler = SIG_DFL; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + if (!in_order((struct timespec) {}, rmtp)) + return -1; + + if (!in_order(rmtp, rqtp)) + return -1; + + return 0; +} + int main(int argc, char **argv) { long long length; @@ -150,6 +200,11 @@ int main(int argc, char **argv) } length *= 100; } + ret = nanosleep_test_remaining(clockid); + if (ret < 0) { + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); + } ksft_test_result_pass("%-31s\n", clockstring(clockid)); next: ret = 0; diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..a563c438ac79 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ 
b/tools/testing/selftests/timers/posix_timers.c @@ -18,6 +18,7 @@ #include <time.h> #include <include/vdso/time64.h> #include <pthread.h> +#include <stdbool.h> #include "../kselftest.h" @@ -670,8 +671,14 @@ static void check_timer_create_exact(void) int main(int argc, char **argv) { + bool run_sig_ign_tests = ksft_min_kernel_version(6, 13); + ksft_print_header(); - ksft_set_plan(19); + if (run_sig_ign_tests) { + ksft_set_plan(19); + } else { + ksft_set_plan(10); + } ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -695,15 +702,20 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); - check_sig_ign(0); - check_sig_ign(1); - check_rearm(); - check_delete(); - check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + if (run_sig_ign_tests) { + check_sig_ign(0); + check_sig_ign(1); + check_rearm(); + check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + } else { + ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n"); + } + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 
5288e768b207..68625362add2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) { ASSERT_EQ(1 << reg.enable_bit, self->check); /* Ensure write shows up at correct offset */ - ASSERT_NE(-1, write(self->data_fd, &reg.write_index, + ASSERT_NE(-1, write(self->data_fd, (void *)&reg.write_index, sizeof(reg.write_index))); val = (void *)(((char *)perf_page) + perf_page->data_offset); ASSERT_EQ(PERF_RECORD_SAMPLE, *val); diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 5fdd0f362337..50c261005111 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -25,10 +25,6 @@ #define VDSO_VERSION 1 #define VDSO_NAMES 0 #define VDSO_32BIT 1 -#elif defined (__s390__) && !defined(__s390x__) -#define VDSO_VERSION 2 -#define VDSO_NAMES 0 -#define VDSO_32BIT 1 #elif defined (__s390x__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 240409bf5f8a..69ec0c856481 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -4,9 +4,12 @@ #include <fcntl.h> #include <string.h> -#include <linux/vfio.h> + +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/list.h> #include <linux/pci_regs.h> +#include <linux/vfio.h> #include "../../../kselftest.h" @@ -185,6 +188,13 @@ struct vfio_pci_device { struct vfio_pci_driver driver; }; +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + /* * Return the BDF string of the device that the test should use. 
* @@ -206,6 +216,13 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges); + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + int __vfio_pci_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region); int __vfio_pci_dma_unmap(struct vfio_pci_device *device, diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index a381fd253aa7..b479a359da12 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,11 +12,12 @@ #include <sys/mman.h> #include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> +#include <linux/overflow.h> #include <linux/types.h> #include <linux/vfio.h> -#include <linux/iommufd.h> #include "../../../kselftest.h" #include <vfio_util.h> @@ -29,6 +30,249 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if 
(!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). 
+ */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(device); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = device->ioas_id, + }; + + ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). 
*/ +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (device->iommufd) + ranges = iommufd_iova_ranges(device, nranges); + else + ranges = vfio_iommu_iova_ranges(device, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = vfio_pci_iova_ranges(device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for 
sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4f1ea79a200c..102603d4407d 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -3,6 +3,8 @@ #include <sys/mman.h> #include <unistd.h> +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/sizes.h> @@ -93,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; }; FIXTURE_VARIANT(vfio_dma_mapping_test) { @@ -117,10 +120,12 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } @@ -142,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) else ASSERT_NE(region.vaddr, MAP_FAILED); - region.iova = (u64)region.vaddr; + region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; vfio_pci_dma_map(self->device, &region); @@ -219,7 +224,10 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); FIXTURE_SETUP(vfio_dma_map_limit_test) { struct vfio_dma_region *region = &self->region; + 
struct iommu_iova_range *ranges; u64 region_size = getpagesize(); + iova_t last_iova; + u32 nranges; /* * Over-allocate mmap by double the size to provide enough backing vaddr @@ -232,8 +240,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - /* One page prior to the end of address space */ - region->iova = ~(iova_t)0 & ~(region_size - 1); + ranges = vfio_pci_iova_ranges(self->device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + last_iova = ranges[nranges - 1].last; + free(ranges); + + /* One page prior to the last iova */ + region->iova = last_iova & ~(region_size - 1); region->size = region_size; } @@ -276,6 +289,7 @@ TEST_F(vfio_dma_map_limit_test, overflow) struct vfio_dma_region *region = &self->region; int rc; + region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; rc = __vfio_pci_dma_map(self->device, region); diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 2dbd70b7db62..f69eec8b928d 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -19,6 +19,7 @@ static const char *device_bdf; } while (0) static void region_setup(struct vfio_pci_device *device, + struct iova_allocator *iova_allocator, struct vfio_dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; @@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device, VFIO_ASSERT_NE(vaddr, MAP_FAILED); region->vaddr = vaddr; - region->iova = (u64)vaddr; + region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; vfio_pci_dma_map(device, region); @@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device, FIXTURE(vfio_pci_driver_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; struct vfio_dma_region memcpy_region; void *vaddr; int msi_fd; @@ -72,14 +74,15 @@ 
FIXTURE_SETUP(vfio_pci_driver_test) struct vfio_pci_driver *driver; self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); driver = &self->device->driver; - region_setup(self->device, &self->memcpy_region, SZ_1G); - region_setup(self->device, &driver->region, SZ_2M); + region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ - self->unmapped_iova = 8UL * SZ_1G; + self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); vfio_pci_driver_init(self->device); self->msi_fd = self->device->msi_eventfds[driver->msi]; @@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) region_teardown(self->device, &self->memcpy_region); region_teardown(self->device, &driver->region); + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 05e1e6774fba..918eaec8bfbe 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -308,12 +308,13 @@ static void test_getcpu(int cpu) #ifdef __x86_64__ static jmp_buf jmpbuf; -static volatile unsigned long segv_err; +static volatile unsigned long segv_err, segv_trapno; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; + segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; segv_err = ctx->uc_mcontext.gregs[REG_ERR]; siglongjmp(jmpbuf, 1); } @@ -336,7 +337,8 @@ static void test_vsys_r(void) else if (can_read) ksft_test_result_pass("We have read access\n"); else - ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); + ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } static 
void test_vsys_x(void) @@ -347,7 +349,7 @@ static void test_vsys_x(void) return; } - ksft_print_msg("Make sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really cause a fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -358,13 +360,14 @@ static void test_vsys_x(void) } if (can_exec) - ksft_test_result_fail("Executing the vsyscall did not page fault\n"); - else if (segv_err & (1 << 4)) /* INSTR */ - ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Executing the vsyscall did not fault\n"); + /* #GP or #PF (with X86_PF_INSTR) */ + else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4)))) + ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); else - ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } /* diff --git a/tools/thermal/thermal-engine/thermal-engine.c b/tools/thermal/thermal-engine/thermal-engine.c index 0764dc754771..66b0ba1fcd23 100644 --- a/tools/thermal/thermal-engine/thermal-engine.c +++ b/tools/thermal/thermal-engine/thermal-engine.c @@ -374,7 +374,7 @@ int main(int argc, char *argv[]) } if (options.daemonize && daemon(0, 0)) { - ERROR("Failed to daemonize: %p\n"); + ERROR("Failed to daemonize: %m\n"); return THERMAL_ENGINE_DAEMON_ERROR; } |
