diff options
Diffstat (limited to 'tools')
159 files changed, 8525 insertions, 5130 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7c7493cb571f..52f076afeb96 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -61,6 +61,7 @@ #define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_IMP_APPLE 0x61 #define ARM_CPU_IMP_AMPERE 0xC0 +#define ARM_CPU_IMP_MICROSOFT 0x6D #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -135,6 +136,8 @@ #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -193,6 +196,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 89d2fc872d9f..964df31da975 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -37,9 +37,7 @@ #include <asm/ptrace.h> #include <asm/sve_context.h> -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -76,11 +74,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define 
KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ - KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK __GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ - KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK __GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 @@ -162,6 +160,11 @@ struct kvm_sync_regs { __u64 device_irq_level; }; +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 9f18fa090f1f..1691297a766a 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -28,7 +28,6 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_GUEST_DEBUG /* Not always available, but if it is, this is the correct offset. 
*/ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -733,4 +732,48 @@ struct kvm_ppc_xive_eq { #define KVM_XIVE_TIMA_PAGE_OFFSET 0 #define KVM_XIVE_ESB_PAGE_OFFSET 4 +/* for KVM_PPC_GET_PVINFO */ + +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index abe926d43cbe..05eaf6db3ad4 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -12,7 +12,320 @@ #include <linux/types.h> #define __KVM_S390 -#define __KVM_HAVE_GUEST_DEBUG + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. 
+ * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + __u8 pad1[6]; /* ignored */ + __u64 old_addr; /* ignored if cmpxchg flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 +#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 + +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) + +/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ +#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) +#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in 
kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 
failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; + +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, 
+ KVM_PV_ASYNC_CLEANUP_PERFORM, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. Must be 0 for now */ + __u32 reserved[3]; +}; + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) /* Device control API: s390-specific devices */ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 25160d26764b..a38f8f9ba657 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on 
Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -97,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ @@ -462,6 +460,14 @@ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ /* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc. + * + * Reuse free bits when adding new feature flags! 
+ */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ + +/* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) @@ -508,4 +514,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 1f23960d2b06..c492bdc97b05 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -123,6 +123,12 @@ # define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) #endif +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -147,8 +153,9 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index 3f73ac3ed3a0..d18bfb238f66 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -84,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#if IS_ENABLED(CONFIG_KVM) #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 
1f9dc9bd13eb..05956bd8bacf 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -176,6 +176,14 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -605,34 +613,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. 
*/ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor 
Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -719,8 +740,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index a448d0964fc0..ef11aa4cab42 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include <linux/const.h> +#include <linux/bits.h> #include <linux/types.h> #include <linux/ioctl.h> #include <linux/stddef.h> @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 
+50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -526,9 +526,301 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. 
+ */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* 
Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 
plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. * Bits Description @@ -549,10 +841,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,7 +852,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. 
*/ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 318e2dad27e0..ae57bf69ad4a 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -76,6 +76,12 @@ enum { DNS }; +enum { + IPV4 = 1, + IPV6, + IP_TYPE_MAX +}; + static int in_hand_shake; static char *os_name = ""; @@ -102,6 +108,11 @@ static struct utsname uts_buf; #define MAX_FILE_NAME 100 #define ENTRIES_PER_BLOCK 50 +/* + * Change this entry if the number of addresses increases in future + */ +#define MAX_IP_ENTRIES 64 +#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES) struct kvp_record { char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; @@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type) return 0; } +int ip_version_check(const char *input_addr) +{ + struct in6_addr addr; + + if (inet_pton(AF_INET, input_addr, &addr)) + return IPV4; + else if (inet_pton(AF_INET6, input_addr, &addr)) + return IPV6; + + return -EINVAL; +} + /* * Only IPv4 subnet strings needs to be converted to plen * For IPv6 the subnet is already privided in plen format @@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str) return plen; } +static int process_dns_gateway_nm(FILE *f, char *ip_string, int type, + int ip_sec) +{ + char addr[INET6_ADDRSTRLEN], *output_str; + int ip_offset = 0, error = 0, ip_ver; + char *param_name; + + if (type == DNS) + param_name = "dns"; + else if (type == GATEWAY) + param_name = "gateway"; + else + return -EINVAL; + + output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char)); + if (!output_str) + return -ENOMEM; + + while (1) { + memset(addr, 0, sizeof(addr)); + + if (!parse_ip_val_buffer(ip_string, &ip_offset, addr, + (MAX_IP_ADDR_SIZE * 2))) + break; + + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if ((ip_ver == IPV4 && ip_sec == 
IPV4) || + (ip_ver == IPV6 && ip_sec == IPV6)) { + /* + * do a bound check to avoid out-of bound writes + */ + if ((OUTSTR_BUF_SIZE - strlen(output_str)) > + (strlen(addr) + 1)) { + strncat(output_str, addr, + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + strncat(output_str, ",", + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + } + } else { + continue; + } + } + + if (strlen(output_str)) { + /* + * This is to get rid of that extra comma character + * in the end of the string + */ + output_str[strlen(output_str) - 1] = '\0'; + error = fprintf(f, "%s=%s\n", param_name, output_str); + } + + free(output_str); + return error; +} + static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, - int is_ipv6) + int ip_sec) { char addr[INET6_ADDRSTRLEN]; char subnet_addr[INET6_ADDRSTRLEN]; - int error, i = 0; + int error = 0, i = 0; int ip_offset = 0, subnet_offset = 0; - int plen; + int plen, ip_ver; memset(addr, 0, sizeof(addr)); memset(subnet_addr, 0, sizeof(subnet_addr)); @@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, subnet_addr, (MAX_IP_ADDR_SIZE * 2))) { - if (!is_ipv6) + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if (ip_ver == IPV4 && ip_sec == IPV4) plen = kvp_subnet_to_plen((char *)subnet_addr); - else + else if (ip_ver == IPV6 && ip_sec == IPV6) plen = atoi(subnet_addr); + else + continue; if (plen < 0) return plen; @@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, memset(subnet_addr, 0, sizeof(subnet_addr)); } - return 0; + return error; } static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) { - int error = 0; + int error = 0, ip_ver; char if_filename[PATH_MAX]; char nm_filename[PATH_MAX]; FILE *ifcfg_file, *nmfile; char cmd[PATH_MAX]; - int is_ipv6 = 0; char *mac_addr; int str_len; @@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto 
setval_error; - if (new_val->addr_family & ADDR_FAMILY_IPV6) { - error = fprintf(nmfile, "\n[ipv6]\n"); - if (error < 0) - goto setval_error; - is_ipv6 = 1; - } else { - error = fprintf(nmfile, "\n[ipv4]\n"); - if (error < 0) - goto setval_error; - } - /* * Now we populate the keyfile format + * + * The keyfile format expects the IPv6 and IPv4 configuration in + * different sections. Therefore we iterate through the list twice, + * once to populate the IPv4 section and the next time for IPv6 */ + ip_ver = IPV4; + do { + if (ip_ver == IPV4) { + error = fprintf(nmfile, "\n[ipv4]\n"); + if (error < 0) + goto setval_error; + } else { + error = fprintf(nmfile, "\n[ipv6]\n"); + if (error < 0) + goto setval_error; + } - if (new_val->dhcp_enabled) { - error = kvp_write_file(nmfile, "method", "", "auto"); - if (error < 0) - goto setval_error; - } else { - error = kvp_write_file(nmfile, "method", "", "manual"); + /* + * Write the configuration for ipaddress, netmask, gateway and + * name services + */ + error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, + (char *)new_val->sub_net, + ip_ver); if (error < 0) goto setval_error; - } - /* - * Write the configuration for ipaddress, netmask, gateway and - * name services - */ - error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, - (char *)new_val->sub_net, is_ipv6); - if (error < 0) - goto setval_error; + /* + * As dhcp_enabled is only valid for ipv4, we do not set dhcp + * methods for ipv6 based on dhcp_enabled flag. + * + * For ipv4, set method to manual only when dhcp_enabled is + * false and specific ipv4 addresses are configured. If neither + * dhcp_enabled is true and no ipv4 addresses are configured, + * set method to 'disabled'. + * + * For ipv6, set method to manual when we configure ipv6 + * addresses. Otherwise set method to 'auto' so that SLAAC from + * RA may be used. 
+ */ + if (ip_ver == IPV4) { + if (new_val->dhcp_enabled) { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } else if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "disabled"); + if (error < 0) + goto setval_error; + } + } else if (ip_ver == IPV6) { + if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } + } - /* we do not want ipv4 addresses in ipv6 section and vice versa */ - if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->gate_way, + GATEWAY, ip_ver); if (error < 0) goto setval_error; - } - if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->dns_addr, DNS, + ip_ver); if (error < 0) goto setval_error; - } + + ip_ver++; + } while (ip_ver < IP_TYPE_MAX); + fclose(nmfile); fclose(ifcfg_file); diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 03f721a8a2b1..54ccccf96e21 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -5,12 +5,12 @@ #include <asm/types.h> /** - * __fls - find last (most-significant) set bit in a long word + * generic___fls - find last (most-significant) set bit in a long word * @word: the word to search * * Undefined if no set bit exists, so code should check against 0 first. 
*/ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long generic___fls(unsigned long word) { int num = BITS_PER_LONG - 1; @@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FLS +#define __fls(word) generic___fls(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */ diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index b168bb10e1be..26f3ce1dd6e4 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FLS_H_ /** - * fls - find last (most-significant) bit set + * generic_fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline int generic_fls(unsigned int x) { int r = 32; @@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x) return r; } +#ifndef __HAVE_ARCH_FLS +#define fls(x) generic_fls(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 72535f00572f..72ea363d434d 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -3,6 +3,8 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +#include <linux/types.h> /* for u32 */ + struct btf_id_set { u32 cnt; u32 ids[]; diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 4b0673bf52c2..07cfad817d53 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -8,6 +8,7 @@ #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/math.h> +#include <linux/panic.h> #include <endian.h> #include <byteswap.h> diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index 7a6b98f4e579..dc0fc7125bc3 100644 --- 
a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -34,4 +34,9 @@ static inline void totalram_pages_add(long count) { } +static inline int early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + #endif diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h new file mode 100644 index 000000000000..9c8f17a41ce8 --- /dev/null +++ b/tools/include/linux/panic.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_PANIC_H +#define _TOOLS_LINUX_PANIC_H + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +static inline void panic(const char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + exit(-1); +} + +#endif diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fd4f9574d177..2ee338860b7e 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -3013,6 +3013,7 @@ struct drm_i915_query_item { * - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions) * - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`) * - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info) + * - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct drm_i915_query_guc_submission_version) */ __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 @@ -3021,6 +3022,7 @@ struct drm_i915_query_item { #define DRM_I915_QUERY_MEMORY_REGIONS 4 #define DRM_I915_QUERY_HWCONFIG_BLOB 5 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6 +#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION 7 /* Must be kept compact -- no holes and well documented */ /** @@ -3567,6 +3569,20 @@ struct drm_i915_query_memory_regions { }; /** + * struct drm_i915_query_guc_submission_version - query GuC submission interface version + */ +struct drm_i915_query_guc_submission_version { + /** @branch: Firmware branch version. */ + __u32 branch; + /** @major: Firmware major version. 
*/ + __u32 major; + /** @minor: Firmware minor version. */ + __u32 minor; + /** @patch: Firmware patch version. */ + __u32 patch; +}; + +/** * DOC: GuC HWCONFIG blob uAPI * * The GuC produces a blob with information about the current device. diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 48ad69f7722e..45e4e64fd664 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -64,6 +64,24 @@ struct fstrim_range { __u64 minlen; }; +/* + * We include a length field because some filesystems (vfat) have an identifier + * that we do want to expose as a UUID, but doesn't have the standard length. + * + * We use a fixed size buffer beacuse this interface will, by fiat, never + * support "UUIDs" longer than 16 bytes; we don't want to force all downstream + * users to have to deal with that. + */ +struct fsuuid2 { + __u8 len; + __u8 uuid[16]; +}; + +struct fs_sysfs_path { + __u8 len; + __u8 name[128]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -215,6 +233,13 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +/* Returns the external filesystem UUID, the same one blkid returns */ +#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) +/* + * Returns the path component under /sys/fs/ that refers to this filesystem; + * also /sys/kernel/debug/ for filesystems with debugfs exports + */ +#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -301,9 +326,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the 
kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482b..2190adbe3002 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -16,6 +16,11 @@ #define KVM_API_VERSION 12 +/* + * Backwards-compatible definitions. + */ +#define __KVM_HAVE_GUEST_DEBUG + /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -85,43 +90,6 @@ struct kvm_pit_config { #define KVM_PIT_SPEAKER_DUMMY 1 -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -#define KVM_S390_CMMA_PEEK (1 << 0) - -/** - * kvm_s390_cmma_log - Used for CMMA migration. - * - * Used both for input and output. - * - * @start_gfn: Guest page number to start from. - * @count: Size of the result buffer. - * @flags: Control operation mode via KVM_S390_CMMA_* flags - * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty - * pages are still remaining. - * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set - * in the PGSTE. - * @values: Pointer to the values buffer. - * - * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. 
- */ -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 @@ -315,11 +283,6 @@ struct kvm_run { __u32 ipb; } s390_sieic; /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 __u64 s390_reset_flags; /* KVM_EXIT_S390_UCONTROL */ struct { @@ -536,43 +499,6 @@ struct kvm_translation { __u8 pad[5]; }; -/* for KVM_S390_MEM_OP */ -struct kvm_s390_mem_op { - /* in */ - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - union { - struct { - __u8 ar; /* the access register number */ - __u8 key; /* access key, ignored if flag unset */ - __u8 pad1[6]; /* ignored */ - __u64 old_addr; /* ignored if cmpxchg flag unset */ - }; - __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* ignored */ - }; -}; -/* types for kvm_s390_mem_op->op */ -#define KVM_S390_MEMOP_LOGICAL_READ 0 -#define KVM_S390_MEMOP_LOGICAL_WRITE 1 -#define KVM_S390_MEMOP_SIDA_READ 2 -#define KVM_S390_MEMOP_SIDA_WRITE 3 -#define KVM_S390_MEMOP_ABSOLUTE_READ 4 -#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 -#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 - -/* flags for kvm_s390_mem_op->flags */ -#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) -#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) - -/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ -#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) -#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) - /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -637,124 +563,6 @@ struct kvm_mp_state { __u32 mp_state; }; -struct kvm_s390_psw { 
- __u64 mask; - __u64 addr; -}; - -/* valid values for type in kvm_s390_interrupt */ -#define KVM_S390_SIGP_STOP 0xfffe0000u -#define KVM_S390_PROGRAM_INT 0xfffe0001u -#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u -#define KVM_S390_RESTART 0xfffe0003u -#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u -#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u -#define KVM_S390_MCHK 0xfffe1000u -#define KVM_S390_INT_CLOCK_COMP 0xffff1004u -#define KVM_S390_INT_CPU_TIMER 0xffff1005u -#define KVM_S390_INT_VIRTIO 0xffff2603u -#define KVM_S390_INT_SERVICE 0xffff2401u -#define KVM_S390_INT_EMERGENCY 0xffff1201u -#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u -/* Anything below 0xfffe0000u is taken by INT_IO */ -#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ - (((schid)) | \ - ((ssid) << 16) | \ - ((cssid) << 18) | \ - ((ai) << 26)) -#define KVM_S390_INT_IO_MIN 0x00000000u -#define KVM_S390_INT_IO_MAX 0xfffdffffu -#define KVM_S390_INT_IO_AI_MASK 0x04000000u - - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -struct kvm_s390_io_info { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u32 pad; - __u64 ext_params2; -}; - -struct kvm_s390_pgm_info { - __u64 trans_exc_code; - __u64 mon_code; - __u64 per_address; - __u32 data_exc_code; - __u16 code; - __u16 mon_class_nr; - __u8 per_code; - __u8 per_atmid; - __u8 exc_access_id; - __u8 per_access_id; - __u8 op_access_id; -#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 -#define KVM_S390_PGM_FLAGS_ILC_0 0x02 -#define KVM_S390_PGM_FLAGS_ILC_1 0x04 -#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 -#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 - __u8 flags; - __u8 pad[2]; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 -struct kvm_s390_stop_info { - __u32 flags; -}; - -struct 
kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 pad; - __u8 fixed_logout[16]; -}; - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -810,50 +618,6 @@ struct kvm_enable_cap { __u8 pad[64]; }; -/* for KVM_PPC_GET_PVINFO */ - -#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -/* for KVM_PPC_GET_SMMU_INFO */ -#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 - -struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ -}; - -struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 -#define KVM_PPC_1T_SEGMENTS 0x00000002 -#define KVM_PPC_NO_HASH 0x00000004 - -struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u16 data_keys; /* # storage keys supported for data */ - __u16 instr_keys; /* # storage keys supported for instructions */ - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - #define KVMIO 0xAE /* machine type bits, to be used as argument to 
KVM_CREATE_VM */ @@ -923,9 +687,7 @@ struct kvm_ppc_resize_hpt { /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 #define KVM_CAP_USER_NMI 22 -#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 -#endif #ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif @@ -1156,8 +918,6 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 -#ifdef KVM_CAP_IRQ_ROUTING - struct kvm_irq_routing_irqchip { __u32 irqchip; __u32 pin; @@ -1222,42 +982,6 @@ struct kvm_irq_routing { struct kvm_irq_routing_entry entries[]; }; -#endif - -#ifdef KVM_CAP_MCE -/* x86 MCE */ -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; -#endif - -#ifdef KVM_CAP_XEN_HVM -#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) -#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) -#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) -#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) -#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) -#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) -#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) -#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; -#endif - #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) /* * Available with KVM_CAP_IRQFD_RESAMPLE @@ -1442,11 +1166,6 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ -struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; -}; #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) @@ -1641,89 +1360,6 @@ struct kvm_enc_region { #define 
KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -struct kvm_s390_pv_sec_parm { - __u64 origin; - __u64 length; -}; - -struct kvm_s390_pv_unp { - __u64 addr; - __u64 size; - __u64 tweak; -}; - -enum pv_cmd_dmp_id { - KVM_PV_DUMP_INIT, - KVM_PV_DUMP_CONFIG_STOR_STATE, - KVM_PV_DUMP_COMPLETE, - KVM_PV_DUMP_CPU, -}; - -struct kvm_s390_pv_dmp { - __u64 subcmd; - __u64 buff_addr; - __u64 buff_len; - __u64 gaddr; /* For dump storage state */ - __u64 reserved[4]; -}; - -enum pv_cmd_info_id { - KVM_PV_INFO_VM, - KVM_PV_INFO_DUMP, -}; - -struct kvm_s390_pv_info_dump { - __u64 dump_cpu_buffer_len; - __u64 dump_config_mem_buffer_per_1m; - __u64 dump_config_finalize_len; -}; - -struct kvm_s390_pv_info_vm { - __u64 inst_calls_list[4]; - __u64 max_cpus; - __u64 max_guests; - __u64 max_guest_addr; - __u64 feature_indication; -}; - -struct kvm_s390_pv_info_header { - __u32 id; - __u32 len_max; - __u32 len_written; - __u32 reserved; -}; - -struct kvm_s390_pv_info { - struct kvm_s390_pv_info_header header; - union { - struct kvm_s390_pv_info_dump dump; - struct kvm_s390_pv_info_vm vm; - }; -}; - -enum pv_cmd_id { - KVM_PV_ENABLE, - KVM_PV_DISABLE, - KVM_PV_SET_SEC_PARMS, - KVM_PV_UNPACK, - KVM_PV_VERIFY, - KVM_PV_PREP_RESET, - KVM_PV_UNSHARE_ALL, - KVM_PV_INFO, - KVM_PV_DUMP, - KVM_PV_ASYNC_CLEANUP_PREPARE, - KVM_PV_ASYNC_CLEANUP_PERFORM, -}; - -struct kvm_pv_cmd { - __u32 cmd; /* Command to be executed */ - __u16 rc; /* Ultravisor return code */ - __u16 rrc; /* Ultravisor return reason code */ - __u64 data; /* Data or address */ - __u32 flags; /* flags for future extensions. 
Must be 0 for now */ - __u32 reserved[3]; -}; - /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) @@ -1737,58 +1373,6 @@ struct kvm_pv_cmd { #define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) #define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) -struct kvm_xen_hvm_attr { - __u16 type; - __u16 pad[3]; - union { - __u8 long_mode; - __u8 vector; - __u8 runstate_update_flag; - struct { - __u64 gfn; -#define KVM_XEN_INVALID_GFN ((__u64)-1) - } shared_info; - struct { - __u32 send_port; - __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ - __u32 flags; -#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) -#define KVM_XEN_EVTCHN_UPDATE (1 << 1) -#define KVM_XEN_EVTCHN_RESET (1 << 2) - /* - * Events sent by the guest are either looped back to - * the guest itself (potentially on a different port#) - * or signalled via an eventfd. - */ - union { - struct { - __u32 port; - __u32 vcpu; - __u32 priority; - } port; - struct { - __u32 port; /* Zero for eventfd */ - __s32 fd; - } eventfd; - __u32 padding[4]; - } deliver; - } evtchn; - __u32 xen_version; - __u64 pad[8]; - } u; -}; - - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 -#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 -#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 -#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ -#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 - /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) @@ -1799,242 +1383,6 @@ struct kvm_xen_hvm_attr { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, 
struct kvm_sregs2) -struct kvm_xen_vcpu_attr { - __u16 type; - __u16 pad[3]; - union { - __u64 gpa; -#define KVM_XEN_INVALID_GPA ((__u64)-1) - __u64 pad[8]; - struct { - __u64 state; - __u64 state_entry_time; - __u64 time_running; - __u64 time_runnable; - __u64 time_blocked; - __u64 time_offline; - } runstate; - __u32 vcpu_id; - struct { - __u32 port; - __u32 priority; - __u64 expires_ns; - } timer; - __u8 vector; - } u; -}; - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 -#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 -#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 - -/* Secure Encrypted Virtualization command */ -enum sev_cmd_id { - /* Guest initialization commands */ - KVM_SEV_INIT = 0, - KVM_SEV_ES_INIT, - /* Guest launch commands */ - KVM_SEV_LAUNCH_START, - KVM_SEV_LAUNCH_UPDATE_DATA, - KVM_SEV_LAUNCH_UPDATE_VMSA, - KVM_SEV_LAUNCH_SECRET, - KVM_SEV_LAUNCH_MEASURE, - KVM_SEV_LAUNCH_FINISH, - /* Guest migration commands (outgoing) */ - KVM_SEV_SEND_START, - KVM_SEV_SEND_UPDATE_DATA, - KVM_SEV_SEND_UPDATE_VMSA, - KVM_SEV_SEND_FINISH, - /* Guest migration commands (incoming) */ - KVM_SEV_RECEIVE_START, - KVM_SEV_RECEIVE_UPDATE_DATA, - KVM_SEV_RECEIVE_UPDATE_VMSA, - KVM_SEV_RECEIVE_FINISH, - /* Guest status and debug commands */ - KVM_SEV_GUEST_STATUS, - KVM_SEV_DBG_DECRYPT, - KVM_SEV_DBG_ENCRYPT, - /* Guest certificates commands */ - KVM_SEV_CERT_EXPORT, - /* Attestation report */ - KVM_SEV_GET_ATTESTATION_REPORT, - /* Guest Migration Extension */ - KVM_SEV_SEND_CANCEL, - - KVM_SEV_NR_MAX, -}; - -struct kvm_sev_cmd { - __u32 id; - 
__u64 data; - __u32 error; - __u32 sev_fd; -}; - -struct kvm_sev_launch_start { - __u32 handle; - __u32 policy; - __u64 dh_uaddr; - __u32 dh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_launch_update_data { - __u64 uaddr; - __u32 len; -}; - - -struct kvm_sev_launch_secret { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_launch_measure { - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_guest_status { - __u32 handle; - __u32 policy; - __u32 state; -}; - -struct kvm_sev_dbg { - __u64 src_uaddr; - __u64 dst_uaddr; - __u32 len; -}; - -struct kvm_sev_attestation_report { - __u8 mnonce[16]; - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_send_start { - __u32 policy; - __u64 pdh_cert_uaddr; - __u32 pdh_cert_len; - __u64 plat_certs_uaddr; - __u32 plat_certs_len; - __u64 amd_certs_uaddr; - __u32 amd_certs_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_send_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_receive_start { - __u32 handle; - __u32 policy; - __u64 pdh_uaddr; - __u32 pdh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_receive_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -#define 
KVM_DEV_IRQ_HOST_MASK 0x00ff -#define KVM_DEV_IRQ_GUEST_MASK 0xff00 - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -/* Available with KVM_CAP_ARM_USER_IRQ */ - -/* Bits for run->s.regs.device_irq_level */ -#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) -#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) -#define KVM_ARM_DEV_PMU (1 << 2) - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) @@ -2180,33 +1528,6 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_ZPCI_OP */ #define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) -struct kvm_s390_zpci_op { - /* in */ - __u32 fh; /* target device */ - __u8 op; /* operation to perform */ - __u8 pad[3]; - union { - /* for KVM_S390_ZPCIOP_REG_AEN */ - struct { - __u64 ibv; /* Guest addr of interrupt bit vector */ - __u64 sb; /* Guest addr of summary bit */ - __u32 flags; - __u32 noi; /* Number of interrupts */ - __u8 isc; /* Guest interrupt subclass */ - __u8 sbo; /* Offset of guest summary bit vector */ - __u16 pad; - } reg_aen; - __u64 reserved[8]; - } u; -}; - -/* types for kvm_s390_zpci_op->op */ -#define KVM_S390_ZPCIOP_REG_AEN 0 -#define KVM_S390_ZPCIOP_DEREG_AEN 1 - -/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ -#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) - 
/* Available with KVM_CAP_MEMORY_ATTRIBUTES */ #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index d5b9cfbd9cea..628d46a0da92 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -416,7 +416,7 @@ struct snd_pcm_hw_params { unsigned int rmask; /* W: requested masks */ unsigned int cmask; /* R: changed masks */ unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py index f131e33ac3ee..058926d69ef0 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/cli.py @@ -19,13 +19,28 @@ class YnlEncoder(json.JSONEncoder): def main(): - parser = argparse.ArgumentParser(description='YNL CLI sample') + description = """ + YNL CLI utility - a general purpose netlink utility that uses YAML + specs to drive protocol encoding and decoding. + """ + epilog = """ + The --multi option can be repeated to include several do operations + in the same netlink payload. 
+ """ + + parser = argparse.ArgumentParser(description=description, + epilog=epilog) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--schema', dest='schema', type=str) parser.add_argument('--no-schema', action='store_true') parser.add_argument('--json', dest='json_text', type=str) - parser.add_argument('--do', dest='do', type=str) - parser.add_argument('--dump', dest='dump', type=str) + + group = parser.add_mutually_exclusive_group() + group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + group.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) + group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + parser.add_argument('--sleep', dest='sleep', type=int) parser.add_argument('--subscribe', dest='ntf', type=str) parser.add_argument('--replace', dest='flags', action='append_const', @@ -73,6 +88,10 @@ def main(): if args.dump: reply = ynl.dump(args.dump, attrs) output(reply) + if args.multi: + ops = [ (item[0], json.loads(item[1]), args.flags or []) for item in args.multi ] + reply = ynl.do_multi(ops) + output(reply) except NlError as e: print(e) exit(1) diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py index 44ba3ba58ed9..63c471f075ab 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/ethtool.py @@ -324,7 +324,13 @@ def main(): return if args.show_time_stamping: - tsinfo = dumpit(ynl, args, 'tsinfo-get') + req = { + 'header': { + 'flags': 'stats', + }, + } + + tsinfo = dumpit(ynl, args, 'tsinfo-get', req) print(f'Time stamping parameters for {args.device}:') @@ -338,6 +344,9 @@ def main(): print('Hardware Receive Filter Modes:') [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + + print('Statistics:') + [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return print(f'Settings for {args.device}:') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 
82d3c98067aa..35f82a2c2247 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -100,9 +100,10 @@ class Netlink: class NlError(Exception): def __init__(self, nl_msg): self.nl_msg = nl_msg + self.error = -nl_msg.error def __str__(self): - return f"Netlink error: {os.strerror(-self.nl_msg.error)}\n{self.nl_msg}" + return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}" class ConfigError(Exception): @@ -202,6 +203,7 @@ class NlMsg: self.done = 1 extack_off = 20 elif self.nl_type == Netlink.NLMSG_DONE: + self.error = struct.unpack("i", self.raw[0:4])[0] self.done = 1 extack_off = 4 @@ -385,12 +387,9 @@ class NetlinkProtocol: def _decode(self, nl_msg): return nl_msg - def decode(self, ynl, nl_msg): + def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) - fixed_header_size = 0 - if ynl: - op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -630,15 +629,28 @@ class YnlFamily(SpecFamily): decoded = self._formatted_string(decoded, attr_spec.display_hint) return decoded - def _decode_array_nest(self, attr, attr_spec): + def _decode_array_attr(self, attr, attr_spec): decoded = [] offset = 0 while offset < len(attr.raw): item = NlAttr(attr.raw, offset) offset += item.full_len - subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) - decoded.append({ item.type: subattrs }) + if attr_spec["sub-type"] == 'nest': + subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) + decoded.append({ item.type: subattrs }) + elif attr_spec["sub-type"] == 'binary': + subattrs = item.as_bin() + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + elif attr_spec["sub-type"] in NlAttr.type_formats: + subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) + if 
attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + else: + raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -732,8 +744,8 @@ class YnlFamily(SpecFamily): decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) if 'enum' in attr_spec: decoded = self._decode_enum(decoded, attr_spec) - elif attr_spec["type"] == 'array-nest': - decoded = self._decode_array_nest(attr, attr_spec) + elif attr_spec["type"] == 'indexed-array': + decoded = self._decode_array_attr(attr, attr_spec) elif attr_spec["type"] == 'bitfield32': value, selector = struct.unpack("II", attr.raw) if 'enum' in attr_spec: @@ -783,7 +795,7 @@ class YnlFamily(SpecFamily): if 'bad-attr-offs' not in extack: return - msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set)) + msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs']) @@ -908,7 +920,8 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - decoded = self.nlproto.decode(self, nl_msg) + op = self.rsp_by_value[nl_msg.cmd()] + decoded = self.nlproto.decode(self, nl_msg, op) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -926,16 +939,11 @@ class YnlFamily(SpecFamily): return op['do']['request']['attributes'].copy() - def _op(self, method, vals, flags=None, dump=False): - op = self.ops[method] - + def _encode_message(self, op, vals, flags, req_seq): nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK for flag in flags or []: nl_flags |= flag - if dump: - nl_flags |= Netlink.NLM_F_DUMP - req_seq = random.randint(1024, 65535) msg = self.nlproto.message(nl_flags, op.req_value, 1, req_seq) 
if op.fixed_header: msg += self._encode_struct(op.fixed_header, vals) @@ -943,18 +951,36 @@ class YnlFamily(SpecFamily): for name, value in vals.items(): msg += self._add_attr(op.attr_set.name, name, value, search_attrs) msg = _genl_msg_finalize(msg) + return msg - self.sock.send(msg, 0) + def _ops(self, ops): + reqs_by_seq = {} + req_seq = random.randint(1024, 65535) + payload = b'' + for (method, vals, flags) in ops: + op = self.ops[method] + msg = self._encode_message(op, vals, flags, req_seq) + reqs_by_seq[req_seq] = (op, msg, flags) + payload += msg + req_seq += 1 + + self.sock.send(payload, 0) done = False rsp = [] + op_rsp = [] while not done: reply = self.sock.recv(self._recv_size) nms = NlMsgs(reply, attr_space=op.attr_set) self._recv_dbg_print(reply, nms) for nl_msg in nms: - if nl_msg.extack: - self._decode_extack(msg, op, nl_msg.extack) + if nl_msg.nl_seq in reqs_by_seq: + (op, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq] + if nl_msg.extack: + self._decode_extack(req_msg, op, nl_msg.extack) + else: + op = self.rsp_by_value[nl_msg.cmd()] + req_flags = [] if nl_msg.error: raise NlError(nl_msg) @@ -962,13 +988,25 @@ class YnlFamily(SpecFamily): if nl_msg.extack: print("Netlink warning:") print(nl_msg) - done = True + + if Netlink.NLM_F_DUMP in req_flags: + rsp.append(op_rsp) + elif not op_rsp: + rsp.append(None) + elif len(op_rsp) == 1: + rsp.append(op_rsp[0]) + else: + rsp.append(op_rsp) + op_rsp = [] + + del reqs_by_seq[nl_msg.nl_seq] + done = len(reqs_by_seq) == 0 break - decoded = self.nlproto.decode(self, nl_msg) + decoded = self.nlproto.decode(self, nl_msg, op) # Check if this is a reply to our request - if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value: + if nl_msg.nl_seq not in reqs_by_seq or decoded.cmd() != op.rsp_value: if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue @@ -979,16 +1017,23 @@ class YnlFamily(SpecFamily): rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: 
rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header)) - rsp.append(rsp_msg) + op_rsp.append(rsp_msg) - if not rsp: - return None - if not dump and len(rsp) == 1: - return rsp[0] return rsp + def _op(self, method, vals, flags=None, dump=False): + req_flags = flags or [] + if dump: + req_flags.append(Netlink.NLM_F_DUMP) + + ops = [(method, vals, req_flags)] + return self._ops(ops)[0] + def do(self, method, vals, flags=None): return self._op(method, vals, flags) def dump(self, method, vals): - return self._op(method, vals, [], dump=True) + return self._op(method, vals, dump=True) + + def do_multi(self, ops): + return self._ops(ops) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index a451cbfbd781..c0b90c104d92 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -841,8 +841,11 @@ class AttrSet(SpecAttrSet): t = TypeBitfield32(self.family, self, elem, value) elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) - elif elem['type'] == 'array-nest': - t = TypeArrayNest(self.family, self, elem, value) + elif elem['type'] == 'indexed-array' and 'sub-type' in elem: + if elem["sub-type"] == 'nest': + t = TypeArrayNest(self.family, self, elem, value) + else: + raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') elif elem['type'] == 'nest-type-value': t = TypeNestTypeValue(self.family, self, elem, value) else: @@ -1055,7 +1058,7 @@ class Family(SpecFamily): if nested in self.root_sets: raise Exception("Inheriting members to a space used as root not supported") inherit.update(set(spec['type-value'])) - elif spec['type'] == 'array-nest': + elif spec['type'] == 'indexed-array': inherit.add('idx') self.pure_nested_structs[nested].set_inherited(inherit) @@ -1619,9 +1622,12 @@ def _multi_parse(ri, struct, init_lines, local_vars): multi_attrs = set() needs_parg = False for arg, aspec in struct.member_list(): - if aspec['type'] == 'array-nest': - local_vars.append(f'const struct nlattr 
*attr_{aspec.c_name};') - array_nests.add(arg) + if aspec['type'] == 'indexed-array' and 'sub-type' in aspec: + if aspec["sub-type"] == 'nest': + local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') + array_nests.add(arg) + else: + raise Exception(f'Not supported sub-type {aspec["sub-type"]}') if 'multi-attr' in aspec: multi_attrs.add(arg) needs_parg |= 'nested-attributes' in aspec diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0b10ad008668..0a33d9195b7a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -585,7 +585,7 @@ static int add_dead_ends(struct objtool_file *file) struct section *rsec; struct reloc *reloc; struct instruction *insn; - unsigned long offset; + uint64_t offset; /* * Check for manually annotated dead ends. diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ec5e21932876..4790c735599b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -970,7 +970,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if (dso->annotate_warned) return -1; - if (not_annotated) { + if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac002d907d81..50ca92255ff6 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2461,6 +2461,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; + if (!list_empty(&notes->src->source)) + return 0; + args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index fb54bd38e7d0..d931a898c434 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -284,6 +284,7 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) struct
task_struct *curr; struct mm_struct___old *mm_old; struct mm_struct___new *mm_new; + struct sighand_struct *sighand; switch (flags) { case LCB_F_READ: /* rwsem */ @@ -305,7 +306,9 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) break; case LCB_F_SPIN: /* spinlock */ curr = bpf_get_current_task_btf(); - if (&curr->sighand->siglock == (void *)lock) + sighand = curr->sighand; + + if (sighand && &sighand->siglock == (void *)lock) return LCD_F_SIGHAND_LOCK; break; default: diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8f08c3fd498d..0d3672e5d9ed 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -67,6 +67,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP ++\fB--no-msr\fP Disable all the uses of the MSR driver. ++.PP ++\fB--no-perf\fP Disable all the uses of the perf API. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. @@ -125,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP -\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP -\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. 
From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. +.PP +\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. +.PP +\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP @@ -370,7 +382,7 @@ below the processor's base frequency. 
Busy% = MPERF_delta/TSC_delta -Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval +Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval Note that these calculations depend on TSC_delta, so they are not reliable during intervals when TSC_MHz is not running at the base frequency. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a334377f92b..98256468e248 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2024 Intel Corporation. * Len Brown <len.brown@intel.com> */ @@ -36,6 +36,8 @@ #include <linux/perf_event.h> #include <asm/unistd.h> #include <stdbool.h> +#include <assert.h> +#include <linux/kernel.h> #define UNUSED(x) (void)(x) @@ -53,9 +55,13 @@ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define MAX_NOFILE 0x8000 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; +enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -127,6 +133,9 @@ struct msr_counter bic[] = { { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -185,11 +194,14 @@ struct msr_counter bic[] = { #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) #define BIC_UNCORE_MHZ (1ULL << 54) +#define BIC_SAM_mc6 (1ULL << 55) +#define BIC_SAMMHz (1ULL << 56) 
+#define BIC_SAMACTMHz (1ULL << 57) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -204,10 +216,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +struct amperf_group_fd; + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. 
*/ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -242,11 +257,8 @@ char *output_buffer, *outp; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; -unsigned int gfx_cur_mhz; -unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; @@ -263,6 +275,28 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +bool no_msr; +bool no_perf; +enum amperf_source amperf_source; + +enum gfx_sysfs_idx { + GFX_rc6, + GFX_MHz, + GFX_ACTMHz, + SAM_mc6, + SAM_MHz, + SAM_ACTMHz, + GFX_MAX +}; + +struct gfx_sysfs_info { + const char *path; + FILE *fp; + unsigned int val; + unsigned long long val_ull; +}; + +static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -652,6 +686,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -948,6 +983,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 +/* Indexes used to map data read from perf and MSRs into global variables */ +enum rapl_rci_index { + RAPL_RCI_INDEX_ENERGY_PKG = 0, + RAPL_RCI_INDEX_ENERGY_CORES = 1, + RAPL_RCI_INDEX_DRAM = 2, + RAPL_RCI_INDEX_GFX = 3, + RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, + RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, + RAPL_RCI_INDEX_CORE_ENERGY = 6, + NUM_RAPL_COUNTERS, +}; + +enum rapl_unit { + RAPL_UNIT_INVALID, + RAPL_UNIT_JOULES, + RAPL_UNIT_WATTS, +}; + +struct rapl_counter_info_t { + unsigned long long 
data[NUM_RAPL_COUNTERS]; + enum rapl_source source[NUM_RAPL_COUNTERS]; + unsigned long long flags[NUM_RAPL_COUNTERS]; + double scale[NUM_RAPL_COUNTERS]; + enum rapl_unit unit[NUM_RAPL_COUNTERS]; + + union { + /* Active when source == RAPL_SOURCE_MSR */ + struct { + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; + }; + }; + + int fd_perf; +}; + +/* struct rapl_counter_info_t for each RAPL domain */ +struct rapl_counter_info_t *rapl_counter_info_perdomain; + +#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) + +struct rapl_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + int msr_shift; /* Positive mean shift right, negative mean shift left */ + double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ + unsigned long long flags; +}; + +static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | 
BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic = BIC_RAMWatt | BIC_RAM_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic = BIC_GFXWatt | BIC_GFX_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_PKG_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, + .bic = BIC_PKG__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_DRAM_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, + .bic = BIC_RAM__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = 
MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = 0, + }, +}; + +struct rapl_counter { + unsigned long long raw_value; + enum rapl_unit unit; + double scale; +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -974,7 +1178,7 @@ struct core_data { unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; - unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ + struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -989,8 +1193,8 @@ struct pkg_data { unsigned long long pc8; unsigned long long pc9; unsigned long long pc10; - unsigned long long cpu_lpi; - unsigned long long sys_lpi; + long long cpu_lpi; + long long sys_lpi; unsigned long long pkg_wtd_core_c0; unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; @@ -998,13 +1202,16 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int gfx_act_mhz; + long long sam_mc6_ms; + unsigned int sam_mhz; + unsigned int sam_act_mhz; unsigned int package_id; - unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ + struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + struct 
rapl_counter rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1150,6 +1357,38 @@ struct sys_counters { struct msr_counter *pp; } sys; +void free_sys_counters(void) +{ + struct msr_counter *p = sys.tp, *pnext = NULL; + + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.cp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.pp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + sys.added_thread_counters = 0; + sys.added_core_counters = 0; + sys.added_package_counters = 0; + sys.tp = NULL; + sys.cp = NULL; + sys.pp = NULL; +} + struct system_summary { struct thread_data threads; struct core_data cores; @@ -1280,34 +1519,60 @@ int get_msr_fd(int cpu) sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " + "or run with --no-msr, or run as root", pathname); fd_percpu[cpu] = fd; return fd; } +static void bic_disable_msr_access(void) +{ + const unsigned long bic_msrs = + BIC_SMI | + BIC_CPU_c1 | + BIC_CPU_c3 | + BIC_CPU_c6 | + BIC_CPU_c7 | + BIC_Mod_c6 | + BIC_CoreTmp | + BIC_Totl_c0 | + BIC_Any_c0 | + BIC_GFX_c0 | + BIC_CPUGFX | + BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + + bic_enabled &= ~bic_msrs; + + free_sys_counters(); +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + assert(!no_perf); + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int perf_instr_count_open(int cpu_num) +static long open_perf_counter(int cpu, unsigned int type, unsigned int config, 
int group_fd, __u64 read_format) { - struct perf_event_attr pea; - int fd; + struct perf_event_attr attr; + const pid_t pid = -1; + const unsigned long flags = 0; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_INSTRUCTIONS; + assert(!no_perf); - /* counter for cpu_num, including user + kernel and all processes */ - fd = perf_event_open(&pea, -1, cpu_num, -1, 0); - if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); - BIC_NOT_PRESENT(BIC_IPC); - } + memset(&attr, 0, sizeof(struct perf_event_attr)); + + attr.type = type; + attr.size = sizeof(struct perf_event_attr); + attr.config = config; + attr.disabled = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.read_format = read_format; + + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); return fd; } @@ -1317,7 +1582,7 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -1326,6 +1591,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; + assert(!no_msr); + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) @@ -1334,6 +1601,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +int probe_msr(int cpu, off_t offset) +{ + ssize_t retval; + unsigned long long dummy; + + assert(!no_msr); + + retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + + if (retval != sizeof(dummy)) + return 1; + + return 0; +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1369,6 +1651,8 @@ void help(void) " Override default 5-second measurement 
interval\n" " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" + " -M, --no-msr Disable all uses of the MSR driver\n" + " -P, --no-perf Disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -1573,6 +1857,15 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_SAM_mc6)) + outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -1671,26 +1964,35 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); + outp += + sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + t->counter[i], mp->path); } } - if (c) { + if (c && is_cpu_first_thread_in_core(t, c, p)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); outp += sprintf(outp, "c7: %016llX\n", c->c7); outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c); outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt); - outp += sprintf(outp, "Joules: %0X\n", c->core_energy); + + const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; + const double energy_scale = c->core_energy.scale; + + if (c->core_energy.unit == RAPL_UNIT_JOULES) + outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for 
(i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); + outp += + sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + c->counter[i], mp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p) { + if (p && is_cpu_first_core_in_package(t, c, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -1710,16 +2012,18 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); + outp += + sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + p->counter[i], mp->path); } } @@ -1728,6 +2032,23 @@ int 
dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } +double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval) +{ + assert(desired_unit != RAPL_UNIT_INVALID); + + /* + * For now we don't expect anything other than joules, + * so just simplify the logic. + */ + assert(c->unit == RAPL_UNIT_JOULES); + + const double scaled = c->raw_value * c->scale; + + if (desired_unit == RAPL_UNIT_WATTS) + return scaled / interval; + return scaled; +} + /* * column formatting convention & formats */ @@ -1921,9 +2242,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) @@ -1951,6 +2274,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* SAMmc6 */ + if (DO_BIC(BIC_SAM_mc6)) { + if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ + outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); + } else { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + p->sam_mc6_ms / 10.0 / interval_float); + } + } + + /* SAMMHz */ + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), p->sam_mhz); + + /* SAMACTMHz */ + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); @@ -1976,43 +2317,59 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); - if (DO_BIC(BIC_CPU_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); - if (DO_BIC(BIC_SYS_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { + if (p->cpu_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } + if (DO_BIC(BIC_SYS_LPI)) { + if (p->sys_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->sys_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } if (DO_BIC(BIC_PkgWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? 
delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - p->energy_dram * rapl_dram_energy_units / interval_float); + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), p->uncore_mhz); @@ -2121,12 +2478,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; - old->energy_pkg = new->energy_pkg - old->energy_pkg; - old->energy_cores = new->energy_cores - old->energy_cores; - old->energy_gfx = new->energy_gfx - old->energy_gfx; - old->energy_dram = new->energy_dram - old->energy_dram; - old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; - old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; + /* flag an error when mc6 counter resets/wraps */ + if (old->sam_mc6_ms > new->sam_mc6_ms) + old->sam_mc6_ms = -1; + else + old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms; + + old->sam_mhz = new->sam_mhz; + old->sam_act_mhz = new->sam_act_mhz; + + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; + old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; + old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; + old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; + old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = + new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2150,7 +2517,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->core_throt_cnt = new->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; - DELTA_WRAP32(new->core_energy, old->core_energy); + DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2277,6 +2644,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void 
rapl_counter_clear(struct rapl_counter *c) +{ + c->raw_value = 0; + c->scale = 0.0; + c->unit = RAPL_UNIT_INVALID; +} + void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2304,7 +2678,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; - c->core_energy = 0; + rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; p->pkg_wtd_core_c0 = 0; @@ -2325,18 +2699,21 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->cpu_lpi = 0; p->sys_lpi = 0; - p->energy_pkg = 0; - p->energy_dram = 0; - p->energy_cores = 0; - p->energy_gfx = 0; - p->rapl_pkg_perf_status = 0; - p->rapl_dram_perf_status = 0; + rapl_counter_clear(&p->energy_pkg); + rapl_counter_clear(&p->energy_dram); + rapl_counter_clear(&p->energy_cores); + rapl_counter_clear(&p->energy_gfx); + rapl_counter_clear(&p->rapl_pkg_perf_status); + rapl_counter_clear(&p->rapl_dram_perf_status); p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; + p->sam_mc6_ms = 0; + p->sam_mhz = 0; + p->sam_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -2347,6 +2724,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->counter[i] = 0; } +void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) +{ + /* Copy unit and scale from src if dst is not initialized */ + if (dst->unit == RAPL_UNIT_INVALID) { + dst->unit = src->unit; + dst->scale = src->scale; + } + + assert(dst->unit == src->unit); + assert(dst->scale == src->scale); + + dst->raw_value += src->raw_value; +} + int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2393,7 +2784,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); 
average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); - average.cores.core_energy += c->core_energy; + rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2428,25 +2819,29 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; - average.packages.energy_pkg += p->energy_pkg; - average.packages.energy_dram += p->energy_dram; - average.packages.energy_cores += p->energy_cores; - average.packages.energy_gfx += p->energy_gfx; + rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; + average.packages.sam_mc6_ms = p->sam_mc6_ms; + average.packages.sam_mhz = p->sam_mhz; + average.packages.sam_act_mhz = p->sam_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); - average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; - average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; + rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) - continue; - average.packages.counter[i] += p->counter[i]; + if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.counter[i] = p->counter[i]; + else + average.packages.counter[i] += p->counter[i]; } return 
0; } @@ -2578,6 +2973,7 @@ unsigned long long snapshot_sysfs_counter(char *path) int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) { if (mp->msr_num != 0) { + assert(!no_msr); if (get_msr(cpu, mp->msr_num, counterp)) return -1; } else { @@ -2599,7 +2995,7 @@ unsigned long long get_uncore_mhz(int package, int die) { char path[128]; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); return (snapshot_sysfs_counter(path) / 1000); @@ -2627,6 +3023,9 @@ int get_epb(int cpu) return epb; msr_fallback: + if (no_msr) + return -1; + get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); return msr & 0xf; @@ -2700,6 +3099,351 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } +struct amperf_group_fd { + int aperf; /* Also the group descriptor */ + int mperf; +}; + +static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) +{ + int fdmt; + int bytes_read; + char buf[64]; + int ret = -1; + + fdmt = open(path, O_RDONLY, 0); + if (fdmt == -1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + bytes_read = read(fdmt, buf, sizeof(buf) - 1); + if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + buf[bytes_read] = '\0'; + + if (sscanf(buf, parse_format, value_ptr) != 1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + ret = 0; + +cleanup_and_exit: + close(fdmt); + return ret; +} + +static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format) +{ + unsigned int v; + int status; + + status = 
read_perf_counter_info(path, parse_format, &v); + if (status) + v = -1; + + return v; +} + +static unsigned int read_msr_type(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/type"; + const char *const format = "%u"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_aperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_mperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_perf_type(const char *subsys) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/type"; + const char *const format = "%u"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys); + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_rapl_config(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; + const char *const format = "event=%x"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; + const char *const format = "%s"; + char path[128]; + char unit_buffer[16]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + read_perf_counter_info(path, format, &unit_buffer); + if (strcmp("Joules", unit_buffer) == 0) + return RAPL_UNIT_JOULES; + + return RAPL_UNIT_INVALID; +} + +static double read_perf_rapl_scale(const char *subsys, const char *event_name) +{ + const char *const path_format = 
"/sys/bus/event_source/devices/%s/events/%s.scale"; + const char *const format = "%lf"; + char path[128]; + double scale; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + if (read_perf_counter_info(path, format, &scale)) + return 0.0; + + return scale; +} + +static struct amperf_group_fd open_amperf_fd(int cpu) +{ + const unsigned int msr_type = read_msr_type(); + const unsigned int aperf_config = read_aperf_config(); + const unsigned int mperf_config = read_mperf_config(); + struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; + + fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + + return fds; +} + +static int get_amperf_fd(int cpu) +{ + assert(fd_amperf_percpu); + + if (fd_amperf_percpu[cpu].aperf) + return fd_amperf_percpu[cpu].aperf; + + fd_amperf_percpu[cpu] = open_amperf_fd(cpu); + + return fd_amperf_percpu[cpu].aperf; +} + +/* Read APERF, MPERF and TSC using the perf API. */ +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) +{ + union { + struct { + unsigned long nr_entries; + unsigned long aperf; + unsigned long mperf; + }; + + unsigned long as_array[3]; + } cnt; + + const int fd_amperf = get_amperf_fd(cpu); + + /* + * Read the TSC with rdtsc, because we want the absolute value and not + * the offset from the start of the counter. + */ + t->tsc = rdtsc(); + + const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + + if (n != sizeof(cnt.as_array)) + return -2; + + t->aperf = cnt.aperf * aperf_mperf_multiplier; + t->mperf = cnt.mperf * aperf_mperf_multiplier; + + return 0; +} + +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. 
*/ +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) +{ + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + int aperf_mperf_retry_count = 0; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system call, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. + */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + +retry: + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + tsc_between = rdtsc(); + + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) + return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differ by more than 2x, then try again. 
+ */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + + t->aperf = t->aperf * aperf_mperf_multiplier; + t->mperf = t->mperf * aperf_mperf_multiplier; + + return 0; +} + +size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) + if (rci->source[i] == RAPL_SOURCE_PERF) + ++ret; + + return ret; +} + +void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) +{ + rc->raw_value = rci->data[idx]; + rc->unit = rci->unit[idx]; + rc->scale = rci->scale[idx]; +} + +int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +{ + unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + + if (debug) + fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); + + assert(rapl_counter_info_perdomain); + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (rci->fd_perf != -1) { + size_t num_perf_counters = rapl_counter_info_count_perf(rci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { + switch (rci->source[i]) { + case RAPL_SOURCE_NONE: + break; + + case RAPL_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(rci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", + i, perf_data[pi], rci->scale[i], 
perf_data[pi] * rci->scale[i]); + + rci->data[i] = perf_data[pi]; + + ++pi; + break; + + case RAPL_SOURCE_MSR: + if (debug) + fprintf(stderr, "Reading rapl counter via msr at %u\n", i); + + assert(!no_msr); + if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) { + if (get_msr_sum(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } else { + if (get_msr(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } + + rci->data[i] &= rci->msr_mask[i]; + if (rci->msr_shift[i] >= 0) + rci->data[i] >>= abs(rci->msr_shift[i]); + else + rci->data[i] <<= abs(rci->msr_shift[i]); + + break; + } + } + + _Static_assert(NUM_RAPL_COUNTERS == 7); + write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); + write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); + write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); + write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX); + write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); + write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); + write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + + return 0; +} + /* * get_counters(...) 
* migrate to cpu @@ -2709,12 +3453,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; - int aperf_mperf_retry_count = 0; struct msr_counter *mp; int i; + int status; if (cpu_migrate(cpu)) { - fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); + fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu); return -1; } @@ -2722,63 +3466,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (first_counter_read) get_apic_id(t); -retry: + t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) || soft_c1_residency_display(BIC_Avg_MHz)) { - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; + int status = -1; - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; + assert(!no_perf || !no_msr); - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. 
- */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + switch (amperf_source) { + case AMPERF_SOURCE_PERF: + status = read_aperf_mperf_tsc_perf(t, cpu); + break; + case AMPERF_SOURCE_MSR: + status = read_aperf_mperf_tsc_msr(t, cpu); + break; } - aperf_mperf_retry_count = 0; - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; + if (status != 0) + return status; } if (DO_BIC(BIC_IPC)) @@ -2806,6 +3513,12 @@ retry: if (!is_cpu_first_thread_in_core(t, c, p)) goto done; + if (platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, c->core_id, c, p); + if (status != 0) + return status; + } + if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; @@ -2846,12 +3559,6 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) - return -14; - c->core_energy = msr & 0xFFFFFFFF; - } - for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -2911,59 +3618,39 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & 
RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; + if (!platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, p->package_id, c, p); + if (status != 0) + return status; } + if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_cur_rc6_ms; - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFX_rc6)) + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; + if (DO_BIC(BIC_GFXMHz)) - p->gfx_mhz = gfx_cur_mhz; + p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) - p->gfx_act_mhz = gfx_act_mhz; + p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; + + if (DO_BIC(BIC_SAM_mc6)) + p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull; + + if (DO_BIC(BIC_SAMMHz)) + p->sam_mhz = gfx_info[SAM_MHz].val; + + if (DO_BIC(BIC_SAMACTMHz)) + p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) @@ -3053,7 +3740,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; switch (platform->cst_limit) { @@ -3097,7 +3784,7 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); @@ -3115,7 +3802,7 @@ static void dump_power_ctl(void) { unsigned long long msr; - if 
(!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); @@ -3321,7 +4008,7 @@ static void dump_cst_cfg(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); @@ -3393,7 +4080,7 @@ void print_irtl(void) { unsigned long long msr; - if (!platform->has_irtl_msrs) + if (!platform->has_irtl_msrs || no_msr) return; if (platform->supported_cstates & PC3) { @@ -3443,12 +4130,64 @@ void free_fd_percpu(void) { int i; + if (!fd_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_percpu[i] != 0) close(fd_percpu[i]); } free(fd_percpu); + fd_percpu = NULL; +} + +void free_fd_amperf_percpu(void) +{ + int i; + + if (!fd_amperf_percpu) + return; + + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_amperf_percpu[i].mperf != 0) + close(fd_amperf_percpu[i].mperf); + + if (fd_amperf_percpu[i].aperf != 0) + close(fd_amperf_percpu[i].aperf); + } + + free(fd_amperf_percpu); + fd_amperf_percpu = NULL; +} + +void free_fd_instr_count_percpu(void) +{ + if (!fd_instr_count_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_instr_count_percpu[i] != 0) + close(fd_instr_count_percpu[i]); + } + + free(fd_instr_count_percpu); + fd_instr_count_percpu = NULL; +} + +void free_fd_rapl_percpu(void) +{ + if (!rapl_counter_info_perdomain) + return; + + const int num_domains = platform->has_per_core_rapl ? 
topo.num_cores : topo.num_packages; + + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) + close(rapl_counter_info_perdomain[domain_id].fd_perf); + } + + free(rapl_counter_info_perdomain); } void free_all_buffers(void) @@ -3492,6 +4231,9 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_instr_count_percpu(); + free_fd_amperf_percpu(); + free_fd_rapl_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -3825,11 +4567,17 @@ static void update_effective_set(bool startup) err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); } +void linux_perf_init(void); +void rapl_perf_init(void); + void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); + linux_perf_init(); + rapl_perf_init(); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, + topo.allowed_cpus); } void set_max_cpu_num(void) @@ -3940,85 +4688,43 @@ int snapshot_proc_interrupts(void) } /* - * snapshot_gfx_rc6_ms() + * snapshot_graphics() * - * record snapshot of - * /sys/class/drm/card0/power/rc6_residency_ms + * record snapshot of specified graphics sysfs knob * * return 1 if config change requires a restart, else return 0 */ -int snapshot_gfx_rc6_ms(void) +int snapshot_graphics(int idx) { FILE *fp; int retval; - fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); - - retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); - if (retval != 1) - err(1, "GFX rc6"); - - fclose(fp); - - return 0; -} - -/* - * snapshot_gfx_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz - * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. 
- * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - } else { - rewind(fp); - fflush(fp); - } - - retval = fscanf(fp, "%d", &gfx_cur_mhz); - if (retval != 1) - err(1, "GFX MHz"); - - return 0; -} - -/* - * snapshot_gfx_cur_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz - * when /sys/class/drm/card0/gt_act_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_act_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); - } else { - rewind(fp); - fflush(fp); + switch (idx) { + case GFX_rc6: + case SAM_mc6: + fp = fopen_or_die(gfx_info[idx].path, "r"); + retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + if (retval != 1) + err(1, "rc6"); + fclose(fp); + return 0; + case GFX_MHz: + case GFX_ACTMHz: + case SAM_MHz: + case SAM_ACTMHz: + if (gfx_info[idx].fp == NULL) { + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + } else { + rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); + } + retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); + if (retval != 1) + err(1, "MHz"); + return 0; + default: + return -EINVAL; } - - retval = fscanf(fp, "%d", &gfx_act_mhz); - if (retval != 1) - err(1, "GFX ACT MHz"); - - return 0; } /* @@ -4083,13 +4789,22 @@ int snapshot_proc_sysfs_files(void) return 1; if (DO_BIC(BIC_GFX_rc6)) - snapshot_gfx_rc6_ms(); + snapshot_graphics(GFX_rc6); if (DO_BIC(BIC_GFXMHz)) - snapshot_gfx_mhz(); + snapshot_graphics(GFX_MHz); if (DO_BIC(BIC_GFXACTMHz)) - snapshot_gfx_act_mhz(); + 
snapshot_graphics(GFX_ACTMHz); + + if (DO_BIC(BIC_SAM_mc6)) + snapshot_graphics(SAM_mc6); + + if (DO_BIC(BIC_SAMMHz)) + snapshot_graphics(SAM_MHz); + + if (DO_BIC(BIC_SAMACTMHz)) + snapshot_graphics(SAM_ACTMHz); if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -4173,6 +4888,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) int ret, idx; unsigned long long msr_cur, msr_last; + assert(!no_msr); + if (!per_cpu_msr_sum) return 1; @@ -4201,6 +4918,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg UNUSED(c); UNUSED(p); + assert(!no_msr); + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; off_t offset; @@ -4280,7 +4999,8 @@ release_msr: /* * set_my_sched_priority(pri) - * return previous + * return previous priority on success + * return value < -20 on failure */ int set_my_sched_priority(int priority) { @@ -4290,16 +5010,16 @@ int set_my_sched_priority(int priority) errno = 0; original_priority = getpriority(PRIO_PROCESS, 0); if (errno && (original_priority == -1)) - err(errno, "getpriority"); + return -21; retval = setpriority(PRIO_PROCESS, 0, priority); if (retval) - errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname); + return -21; errno = 0; retval = getpriority(PRIO_PROCESS, 0); if (retval != priority) - err(retval, "getpriority(%d) != setpriority(%d)", retval, priority); + return -21; return original_priority; } @@ -4314,6 +5034,9 @@ void turbostat_loop() /* * elevate own priority for interval mode + * + * ignore on error - we probably don't have permission to set it, but + * it's not a big deal */ set_my_sched_priority(-20); @@ -4399,10 +5122,13 @@ void check_dev_msr() struct stat sb; char pathname[32]; + if (no_msr) + return; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + no_msr = 1; } /* @@ 
-4414,47 +5140,51 @@ int check_for_cap_sys_rawio(void) { cap_t caps; cap_flag_value_t cap_flag_value; + int ret = 0; caps = cap_get_proc(); if (caps == NULL) - err(-6, "cap_get_proc\n"); + return 1; - if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) - err(-6, "cap_get\n"); + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { + ret = 1; + goto free_and_exit; + } if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); - return 1; + ret = 1; + goto free_and_exit; } +free_and_exit: if (cap_free(caps) == -1) err(-6, "cap_free\n"); - return 0; + return ret; } -void check_permissions(void) +void check_msr_permission(void) { - int do_exit = 0; + int failed = 0; char pathname[32]; + if (no_msr) + return; + /* check for CAP_SYS_RAWIO */ - do_exit += check_for_cap_sys_rawio(); + failed += check_for_cap_sys_rawio(); /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { - do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr"); + failed++; } - /* if all else fails, thell them to be root */ - if (do_exit) - if (getuid() != 0) - warnx("... or simply run as root"); - - if (do_exit) - exit(-6); + if (failed) { + warnx("Failed to access %s. 
Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr"); + no_msr = 1; + } } void probe_bclk(void) @@ -4462,7 +5192,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4502,7 +5232,7 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->trl_msrs & TRL_LIMIT2) @@ -4567,20 +5297,15 @@ static void dump_sysfs_file(char *path) static void probe_intel_uncore_frequency(void) { int i, j; - char path[128]; + char path[256]; if (!genuine_intel) return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) - return; - - /* Cluster level sysfs not supported yet. */ - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) - return; + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + goto probe_cluster; - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - BIC_PRESENT(BIC_UNCORE_MHZ); + BIC_PRESENT(BIC_UNCORE_MHZ); if (quiet) return; @@ -4588,40 +5313,178 @@ static void probe_intel_uncore_frequency(void) for (i = 0; i < topo.num_packages; ++i) { for (j = 0; j < topo.num_die; ++j) { int k, l; + char path_base[128]; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, + j); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz", - i, j); + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz", - i, j); + sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - 
fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz", - i, j); + sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz", - i, j); + sprintf(path, "%s/initial_max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); } } + return; + +probe_cluster: + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) + return; + + if (quiet) + return; + + for (i = 0;; ++i) { + int k, l; + char path_base[128]; + int package_id, domain_id, cluster_id; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + + if (access(path_base, R_OK)) + break; + + sprintf(path, "%s/package_id", path_base); + package_id = read_sysfs_int(path); + + sprintf(path, "%s/domain_id", path_base); + domain_id = read_sysfs_int(path); + + sprintf(path, "%s/fabric_cluster_id", path_base); + cluster_id = read_sysfs_int(path); + + sprintf(path, "%s/min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, + cluster_id, k / 1000, l / 1000); + + sprintf(path, "%s/initial_min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/initial_max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, 
"%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); + } } static void probe_graphics(void) { + /* Xe graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { + FILE *fp; + char buf[8]; + bool gt0_is_gt; + int idx; + + fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); + if (!fp) + goto next; + + if (!fread(buf, sizeof(char), 7, fp)) { + fclose(fp); + goto next; + } + fclose(fp); + + if (!strncmp(buf, "gt0-rc", strlen("gt0-rc"))) + gt0_is_gt = true; + else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc"))) + gt0_is_gt = false; + else + goto next; + + idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? GFX_MHz : SAM_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + + idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + + idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; + if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? SAM_MHz : GFX_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + + idx = gt0_is_gt ? 
SAM_ACTMHz : GFX_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + + goto end; + } + +next: + /* New i915 graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) + gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) + gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) + gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + + goto end; + } + + /* Fall back to traditional i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if 
(!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; + +end: + if (gfx_info[GFX_rc6].path) + BIC_PRESENT(BIC_GFX_rc6); + if (gfx_info[GFX_MHz].path) + BIC_PRESENT(BIC_GFXMHz); + if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); + if (gfx_info[SAM_mc6].path) + BIC_PRESENT(BIC_SAM_mc6); + if (gfx_info[SAM_MHz].path) + BIC_PRESENT(BIC_SAMMHz); + if (gfx_info[SAM_ACTMHz].path) + BIC_PRESENT(BIC_SAMACTMHz); } static void dump_sysfs_cstate_config(void) @@ -4783,6 +5646,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!has_hwp) return 0; @@ -4869,6 +5735,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + cpu = t->cpu_id; /* per-package */ @@ -4983,31 +5852,18 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_Pkg_J); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_Cor_J); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAM_J); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFX_J); - } else { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_PkgWatt); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_CorWatt); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAMWatt); - if (platform->rapl_msrs & 
RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFXWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) - BIC_PRESENT(BIC_PKG__); - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) - BIC_PRESENT(BIC_RAM__); + if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + bic_enabled &= ~BIC_PKG__; + if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + bic_enabled &= ~BIC_RAM__; /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -5041,14 +5897,13 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; @@ -5202,7 +6057,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) */ void probe_rapl(void) { - if (!platform->rapl_msrs) + if (!platform->rapl_msrs || no_msr) return; if (genuine_intel) @@ -5258,7 +6113,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5305,6 +6160,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!(do_dts || do_ptm)) return 0; @@ -5402,6 +6260,9 @@ void decode_feature_control_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: 
MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); @@ -5411,6 +6272,9 @@ void decode_misc_enable_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!genuine_intel) return; @@ -5428,6 +6292,9 @@ void decode_misc_feature_control(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_feature_control) return; @@ -5449,6 +6316,9 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_pwr_mgmt) return; @@ -5468,6 +6338,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_c6_demotion_policy_config) return; @@ -5489,7 +6362,8 @@ void print_dev_latency(void) fd = open(path, O_RDONLY); if (fd < 0) { - warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); + if (debug) + warnx("Read %s failed", path); return; } @@ -5504,23 +6378,260 @@ void print_dev_latency(void) close(fd); } +static int has_instr_count_access(void) +{ + int fd; + int has_access; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + has_access = fd != -1; + + if (fd != -1) + close(fd); + + if (!has_access) + warnx("Failed to access %s. 
Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "instructions retired perf counter", "--no-perf"); + + return has_access; +} + +bool is_aperf_access_required(void) +{ + return BIC_IS_ENABLED(BIC_Avg_MHz) + || BIC_IS_ENABLED(BIC_Busy) + || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC); +} + +int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale_, enum rapl_unit *unit_) +{ + if (no_perf) + return -1; + + const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + + if (scale == 0.0) + return -1; + + const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + + if (unit == RAPL_UNIT_INVALID) + return -1; + + const unsigned int rapl_type = read_perf_type(cai->perf_subsys); + const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (rci->fd_perf == -1) + rci->fd_perf = fd_counter; + + *scale_ = scale; + *unit_ = unit; + return fd_counter; +} + +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale, enum rapl_unit *unit) +{ + int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover */ void linux_perf_init(void) { - if (!BIC_IS_ENABLED(BIC_IPC)) - return; - if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); - if (fd_instr_count_percpu == NULL) 
- err(-1, "calloc fd_instr_count_percpu"); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + } + + const bool aperf_required = is_aperf_access_required(); + + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { + fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); + if (fd_amperf_percpu == NULL) + err(-1, "calloc fd_amperf_percpu"); + } +} + +void rapl_perf_init(void) +{ + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + bool *domain_visited = calloc(num_domains, sizeof(bool)); + + rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); + if (rapl_counter_info_perdomain == NULL) + err(-1, "calloc rapl_counter_info_percpu"); + + /* + * Initialize rapl_counter_info_percpu + */ + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + + rci->fd_perf = -1; + for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { + rci->data[i] = 0; + rci->source[i] = RAPL_SOURCE_NONE; + } + } - BIC_PRESENT(BIC_IPC); + /* + * Open/probe the counters + * If can't get it via perf, fallback to MSR + */ + for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) { + + const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i]; + bool has_counter = 0; + double scale; + enum rapl_unit unit; + int next_domain; + + memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + /* Skip already seen and handled RAPL domains */ + next_domain = + platform->has_per_core_rapl ? 
cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + + if (domain_visited[next_domain]) + continue; + + domain_visited[next_domain] = 1; + + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; + + /* Check if the counter is enabled and accessible */ + if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) { + + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { + rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->scale[cai->rci_index] = scale * cai->compat_scale; + rci->unit[cai->rci_index] = unit; + rci->flags[cai->rci_index] = cai->flags; + + /* Use MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->msr[cai->rci_index] = cai->msr; + rci->msr_mask[cai->rci_index] = cai->msr_mask; + rci->msr_shift[cai->rci_index] = cai->msr_shift; + rci->unit[cai->rci_index] = RAPL_UNIT_JOULES; + rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale; + rci->flags[cai->rci_index] = cai->flags; + } + } + + if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + has_counter = 1; + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(domain_visited); +} + +static int has_amperf_access_via_msr(void) +{ + if (no_msr) + return 0; + + if (probe_msr(base_cpu, MSR_IA32_APERF)) + return 0; + + if (probe_msr(base_cpu, MSR_IA32_MPERF)) + return 0; + + return 1; +} + +static int has_amperf_access_via_perf(void) +{ + struct amperf_group_fd fds; + + /* + * Cache the last result, so we don't warn the user multiple times + * + * Negative means cached, no access + * Zero means not cached + * Positive means cached, has access + */ + static int has_access_cached; + + if (no_perf) + return 0; + + if (has_access_cached != 0) + return has_access_cached > 0; + + fds = 
open_amperf_fd(base_cpu); + has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); + + if (fds.aperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "APERF perf counter", "--no-perf"); + else + close(fds.aperf); + + if (fds.mperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "MPERF perf counter", "--no-perf"); + else + close(fds.mperf); + + if (has_access_cached == 0) + has_access_cached = -1; + + return has_access_cached > 0; +} + +/* Check if we can access APERF and MPERF */ +static int has_amperf_access(void) +{ + if (!is_aperf_access_required()) + return 0; + + if (!no_msr && has_amperf_access_via_msr()) + return 1; + + if (!no_perf && has_amperf_access_via_perf()) + return 1; + + return 0; } void probe_cstates(void) @@ -5563,7 +6674,7 @@ void probe_cstates(void) if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (platform->has_ext_cst_msrs) { + if (platform->has_ext_cst_msrs && !no_msr) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); @@ -5623,6 +6734,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned long long ucode_patch = 0; + bool ucode_patch_valid = false; eax = ebx = ecx = edx = 0; @@ -5650,8 +6762,12 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) - warnx("get_msr(UCODE)"); + if (!no_msr) { + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; + } /* * check max extended function levels of CPUID. 
@@ -5662,9 +6778,12 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", + family, model, stepping, family, model, stepping); + if (ucode_patch_valid) + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fputc('\n', outf); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", @@ -5700,10 +6819,11 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf) { + if (has_aperf && has_amperf_access()) { BIC_PRESENT(BIC_Avg_MHz); BIC_PRESENT(BIC_Busy); BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_IPC); } do_dts = eax & (1 << 0); if (do_dts) @@ -5786,6 +6906,15 @@ void process_cpuid() base_mhz = max_mhz = bus_mhz = edx = 0; __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); + + bclk = bus_mhz; + + base_hz = base_mhz * 1000000; + has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; + if (!quiet) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); @@ -5814,7 +6943,7 @@ void probe_pm_features(void) probe_thermal(); - if (platform->has_nhm_msrs) + if (platform->has_nhm_msrs && !no_msr) BIC_PRESENT(BIC_SMI); if (!quiet) @@ -6142,6 +7271,7 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } + void setup_all_buffers(bool startup) { topology_probe(startup); @@ -6169,21 +7299,129 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void set_amperf_source(void) +{ + amperf_source = AMPERF_SOURCE_PERF; + + const bool aperf_required = 
is_aperf_access_required(); + + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) + amperf_source = AMPERF_SOURCE_MSR; + + if (quiet || !debug) + return; + + fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); +} + +bool has_added_counters(void) +{ + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ + + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} + +bool is_msr_access_required(void) +{ + if (no_msr) + return false; + + if (has_added_counters()) + return true; + + return BIC_IS_ENABLED(BIC_SMI) + || BIC_IS_ENABLED(BIC_CPU_c1) + || BIC_IS_ENABLED(BIC_CPU_c3) + || BIC_IS_ENABLED(BIC_CPU_c6) + || BIC_IS_ENABLED(BIC_CPU_c7) + || BIC_IS_ENABLED(BIC_Mod_c6) + || BIC_IS_ENABLED(BIC_CoreTmp) + || BIC_IS_ENABLED(BIC_Totl_c0) + || BIC_IS_ENABLED(BIC_Any_c0) + || BIC_IS_ENABLED(BIC_GFX_c0) + || BIC_IS_ENABLED(BIC_CPUGFX) + || BIC_IS_ENABLED(BIC_Pkgpc3) + || BIC_IS_ENABLED(BIC_Pkgpc6) + || BIC_IS_ENABLED(BIC_Pkgpc2) + || BIC_IS_ENABLED(BIC_Pkgpc7) + || BIC_IS_ENABLED(BIC_Pkgpc8) + || BIC_IS_ENABLED(BIC_Pkgpc9) + || BIC_IS_ENABLED(BIC_Pkgpc10) + /* TODO: Multiplex access with perf */ + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_PkgWatt) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_GFXWatt) + || BIC_IS_ENABLED(BIC_RAMWatt) + || BIC_IS_ENABLED(BIC_Pkg_J) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_GFX_J) + || BIC_IS_ENABLED(BIC_RAM_J) + || BIC_IS_ENABLED(BIC_PKG__) + || BIC_IS_ENABLED(BIC_RAM__) + || BIC_IS_ENABLED(BIC_PkgTmp) + || (is_aperf_access_required() && !has_amperf_access_via_perf()); +} + +void check_msr_access(void) +{ + if (!is_msr_access_required()) + no_msr = 1; + + check_dev_msr(); + check_msr_permission(); + + if (no_msr) + bic_disable_msr_access(); +} + +void check_perf_access(void) +{ + const bool 
intrcount_required = BIC_IS_ENABLED(BIC_IPC); + + if (no_perf || !intrcount_required || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; + + const bool aperf_required = is_aperf_access_required(); + + if (!aperf_required || !has_amperf_access()) { + bic_enabled &= ~BIC_Avg_MHz; + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + bic_enabled &= ~BIC_IPC; + } +} + void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - check_dev_msr(); - check_permissions(); + check_msr_access(); + check_perf_access(); process_cpuid(); probe_pm_features(); + set_amperf_source(); linux_perf_init(); + rapl_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); + + /* + * If TSC tweak is needed, but couldn't get it, + * disable more BICs, since it can't be reported accurately. + */ + if (platform->enable_tsc_tweak && !has_base_hz) { + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + } } int fork_it(char **argv) @@ -6259,7 +7497,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.11.07 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -6291,6 +7529,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, { struct msr_counter *msrp; + if (no_msr && msr_num) + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); + msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) { perror("calloc"); @@ -6595,6 +7836,8 @@ void cmdline(int argc, char **argv) { "list", no_argument, 0, 'l' }, { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, + { "no-msr", no_argument, 0, 'M' }, + { "no-perf", no_argument, 0, 'P' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6604,7 +7847,25 @@ void cmdline(int 
argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { + /* + * Parse some options early, because they may make other options invalid, + * like adding the MSR counter with --add and at the same time using --no-msr. + */ + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { + switch (opt) { + case 'M': + no_msr = 1; + break; + case 'P': + no_perf = 1; + break; + default: + break; + } + } + optind = 0; + + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6662,6 +7923,10 @@ void cmdline(int argc, char **argv) case 'q': quiet = 1; break; + case 'M': + case 'P': + /* Parsed earlier */ + break; case 'n': num_iterations = strtod(optarg, NULL); @@ -6704,6 +7969,22 @@ void cmdline(int argc, char **argv) } } +void set_rlimit(void) +{ + struct rlimit limit; + + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to get rlimit"); + + if (limit.rlim_max < MAX_NOFILE) + limit.rlim_max = MAX_NOFILE; + if (limit.rlim_cur < MAX_NOFILE) + limit.rlim_cur = MAX_NOFILE; + + if (setrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to set rlimit"); +} + int main(int argc, char **argv) { int fd, ret; @@ -6729,9 +8010,13 @@ skip_cgroup_setting: probe_sysfs(); + if (!getuid()) + set_rlimit(); + turbostat_init(); - msr_sum_record(); + if (!no_msr) + msr_sum_record(); /* dump counters and exit */ if (dump_only) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 908e0d083936..61c69297e797 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, { dpa_perf->qos_class = FAKE_QTG_ID; dpa_perf->dpa_range = *range; - dpa_perf->coord.read_latency = 500; - dpa_perf->coord.write_latency = 500; - 
dpa_perf->coord.read_bandwidth = 1000; - dpa_perf->coord.write_bandwidth = 1000; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i].read_latency = 500; + dpa_perf->coord[i].write_latency = 500; + dpa_perf->coord[i].read_bandwidth = 1000; + dpa_perf->coord[i].write_bandwidth = 1000; + } } static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index c5914f4e75e1..b3b00269a52a 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -40,6 +40,7 @@ CONFIG_DAMON_VADDR=y CONFIG_DAMON_PADDR=y CONFIG_DEBUG_FS=y CONFIG_DAMON_DBGFS=y +CONFIG_DAMON_DBGFS_DEPRECATED=y CONFIG_REGMAP_BUILD=y diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e1504833654d..c785b6256a45 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += devices TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice +TARGETS += drivers/net TARGETS += drivers/net/bonding TARGETS += drivers/net/team TARGETS += dt @@ -63,7 +64,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao -TARGETS += netfilter +TARGETS += net/netfilter TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd @@ -116,6 +117,13 @@ TARGETS += zram TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug +# Networking tests want the net/lib target, include it automatically +ifneq ($(filter net drivers/net,$(TARGETS)),) +ifeq ($(filter net/lib,$(TARGETS)),) + INSTALL_DEP_TARGETS := net/lib +endif +endif + # User can optionally provide a TARGETS skiplist. By default we skip # BPF since it has cutting edge build time dependencies which require # more effort to install. 
@@ -245,7 +253,7 @@ ifdef INSTALL_PATH install -m 744 run_kselftest.sh $(INSTALL_PATH)/ rm -f $(TEST_LIST) @ret=1; \ - for TARGET in $(TARGETS); do \ + for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \ INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d6343..f06c527eee34 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -102,7 +102,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_offload.py \ test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ @@ -136,18 +135,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi -# Emit succinct information message describing current building step -# $1 - generic step name (e.g., CC, LINK, etc); -# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; -# $3 - target (assumed to be file); only file name will be emitted; -# $4 - optional extra arg, emitted as-is, if provided. 
-ifeq ($(V),1) -Q = -msg = -else -Q = @ -msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; -MAKEFLAGS += --no-print-directory +ifneq ($(V),1) submake_extras := feature_display=0 endif diff --git a/tools/testing/selftests/dmabuf-heaps/config b/tools/testing/selftests/dmabuf-heaps/config new file mode 100644 index 000000000000..be091f1cdfa0 --- /dev/null +++ b/tools/testing/selftests/dmabuf-heaps/config @@ -0,0 +1,3 @@ +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +CONFIG_DRM_VGEM=y diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile new file mode 100644 index 000000000000..754ec643768a --- /dev/null +++ b/tools/testing/selftests/drivers/net/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := $(wildcard lib/py/*.py) + +TEST_PROGS := \ + ping.py \ + stats.py \ +# end of TEST_PROGS + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst new file mode 100644 index 000000000000..0cbab33dad1f --- /dev/null +++ b/tools/testing/selftests/drivers/net/README.rst @@ -0,0 +1,63 @@ +Running tests +============= + +Tests are executed within kselftest framework like any other tests. +By default tests execute against software drivers such as netdevsim. +All tests must support running against a real device (SW-only tests +should instead be placed in net/ or drivers/net/netdevsim, HW-only +tests in drivers/net/hw). + +Set appropriate variables to point the tests at a real device. + +Variables +========= + +Variables can be set in the environment or by creating a net.config +file in the same directory as this README file. 
Example:: + + $ NETIF=eth0 ./some_test.sh + +or:: + + $ cat tools/testing/selftests/drivers/net/net.config + # Variable set in a file + NETIF=eth0 + +Please note that the config parser is very simple: if the value contains +any non-alphanumeric characters, it needs to be enclosed in +double quotes. + +NETIF +~~~~~ + +Name of the netdevice against which the test should be executed. +When empty or not set, software devices will be used. + +LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local and remote endpoint IP addresses. + +REMOTE_TYPE +~~~~~~~~~~~ + +Communication method used to run commands on the remote endpoint. +The test framework has built-in support for ``netns`` and ``ssh`` channels. +``netns`` assumes the "remote" interface is part of the same +host, just moved to the specified netns. +``ssh`` communicates with the remote endpoint over ``ssh`` and ``scp``. +Using persistent SSH connections is strongly encouraged to avoid +the latency of SSH connection setup on every command. + +Communication methods are defined by classes in ``lib/py/remote_{name}.py``. +It should be possible to add a new method without modifying any of +the framework, by simply adding an appropriately named file to ``lib/py``. + +REMOTE_ARGS +~~~~~~~~~~~ + +Arguments used to construct the communication channel.
+Communication channel dependent:: + + for netns - name of the "remote" namespace + for ssh - name/address of the remote host diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config new file mode 100644 index 000000000000..f6a58ce8a230 --- /dev/null +++ b/tools/testing/selftests/drivers/net/config @@ -0,0 +1,2 @@ +CONFIG_IPV6=y +CONFIG_NETDEVSIM=m diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh index bb12d5d70949..fa6953de6b6d 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -65,9 +65,8 @@ same_speeds_autoneg_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "speed $speed autoneg off" - log_test "force of same speed autoneg off" - log_info "speed = $speed" + check_err $? "ping with speed $speed autoneg off" + log_test "force speed $speed on both ends" done ethtool -s $h2 autoneg on @@ -112,9 +111,8 @@ combination_of_neg_on_and_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1-speed=$speed autoneg off, h2 autoneg on" - log_test "one side with autoneg off and another with autoneg on" - log_info "force speed = $speed" + check_err $? "ping with h1-speed=$speed autoneg off, h2 autoneg on" + log_test "force speed $speed vs. autoneg" done ethtool -s $h1 autoneg on @@ -207,10 +205,9 @@ advertise_subset_of_speeds() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + check_err $? "ping with h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" - log_test "advertise subset of speeds" - log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" + log_test "advertise $speed_1_to_advertise vs. 
$speed_2_to_advertise" done ethtool -s $h2 autoneg on diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh index e2a1c10d3503..8f60c1685ad4 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -44,6 +44,7 @@ bucket_test() # Mausezahn does not include FCS bytes in its length - but the # histogram counters do len=$((len - ETH_FCS_LEN)) + len=$((len > 0 ? len : 0)) before=$(ethtool --json -S $iface --groups rmon | \ jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh index 7dfc50366c99..67fafefc80be 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -50,6 +50,7 @@ ALL_TESTS=" NUM_NETIFS=4 lib_dir=$(dirname "$0") source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh h1_create() { diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh index ab8d04855af5..a94d92e1abce 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -15,6 +15,7 @@ NUM_NETIFS=6 lib_dir=$(dirname "$0") source "$lib_dir"/../../../net/forwarding/lib.sh source "$lib_dir"/../../../net/forwarding/ipip_lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh setup_prepare() { diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py new file mode 100644 index 000000000000..4789c1a4282d --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = 
(Path(__file__).parent / "../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +from .env import * +from .remote import Remote diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py new file mode 100644 index 000000000000..a3db1bb1afeb --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import shlex +from pathlib import Path +from lib.py import KsftSkipEx +from lib.py import cmd, ip +from lib.py import NetNS, NetdevSimDev +from .remote import Remote + + +def _load_env_file(src_path): + env = os.environ.copy() + + src_dir = Path(src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return env + + lexer = shlex.shlex(open((src_dir / "net.config").as_posix(), 'r').read()) + k = None + for token in lexer: + if k is None: + k = token + env[k] = "" + elif token == "=": + pass + else: + env[k] = token + k = None + return env + + +class NetDrvEnv: + """ + Class for a single NIC / host env, with no remote end + """ + def __init__(self, src_path): + self._ns = None + + self.env = _load_env_file(src_path) + + if 'NETIF' in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + else: + self._ns = NetdevSimDev() + self.dev = self._ns.nsims[0].dev + self.ifindex = self.dev['ifindex'] + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + + +class NetDrvEpEnv: + """ + Class for an environment with a local device and "remote endpoint" + which can be used to send traffic in. 
+ + For local testing it creates two network namespaces and a pair + of netdevsim devices. + """ + + # Network prefixes used for local tests + nsim_v4_pfx = "192.0.2." + nsim_v6_pfx = "2001:db8::" + + def __init__(self, src_path): + + self.env = _load_env_file(src_path) + + # Things we try to destroy + self.remote = None + # These are for local testing state + self._netns = None + self._ns = None + self._ns_peer = None + + if "NETIF" in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + + self.v4 = self.env.get("LOCAL_V4") + self.v6 = self.env.get("LOCAL_V6") + self.remote_v4 = self.env.get("REMOTE_V4") + self.remote_v6 = self.env.get("REMOTE_V6") + kind = self.env["REMOTE_TYPE"] + args = self.env["REMOTE_ARGS"] + else: + self.create_local() + + self.dev = self._ns.nsims[0].dev + + self.v4 = self.nsim_v4_pfx + "1" + self.v6 = self.nsim_v6_pfx + "1" + self.remote_v4 = self.nsim_v4_pfx + "2" + self.remote_v6 = self.nsim_v6_pfx + "2" + kind = "netns" + args = self._netns.name + + self.remote = Remote(kind, args, src_path) + + self.addr = self.v6 if self.v6 else self.v4 + self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + + self.addr_ipver = "6" if self.v6 else "4" + # Bracketed addresses, some commands need IPv6 to be inside [] + self.baddr = f"[{self.v6}]" if self.v6 else self.v4 + self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4 + + self.ifname = self.dev['ifname'] + self.ifindex = self.dev['ifindex'] + + self._required_cmd = {} + + def create_local(self): + self._netns = NetNS() + self._ns = NetdevSimDev() + self._ns_peer = NetdevSimDev(ns=self._netns) + + with open("/proc/self/ns/net") as nsfd0, \ + open("/var/run/netns/" + self._netns.name) as nsfd1: + ifi0 = self._ns.nsims[0].ifindex + ifi1 = self._ns_peer.nsims[0].ifindex + NetdevSimDev.ctrl_write('link_device', + f'{nsfd0.fileno()}:{ifi0} {nsfd1.fileno()}:{ifi1}') + + ip(f" addr add dev {self._ns.nsims[0].ifname} 
{self.nsim_v4_pfx}1/24") + ip(f"-6 addr add dev {self._ns.nsims[0].ifname} {self.nsim_v6_pfx}1/64 nodad") + ip(f" link set dev {self._ns.nsims[0].ifname} up") + + ip(f" addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v4_pfx}2/24", ns=self._netns) + ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns) + ip(f" link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns) + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + if self._ns_peer: + self._ns_peer.remove() + self._ns_peer = None + if self._netns: + del self._netns + self._netns = None + if self.remote: + del self.remote + self.remote = None + + def require_v4(self): + if not self.v4 or not self.remote_v4: + raise KsftSkipEx("Test requires IPv4 connectivity") + + def require_v6(self): + if not self.v6 or not self.remote_v6: + raise KsftSkipEx("Test requires IPv6 connectivity") + + def _require_cmd(self, comm, key, host=None): + cached = self._required_cmd.get(comm, {}) + if cached.get(key) is None: + cached[key] = cmd("command -v -- " + comm, fail=False, + shell=True, host=host).ret == 0 + self._required_cmd[comm] = cached + return cached[key] + + def require_cmd(self, comm, local=True, remote=False): + if local: + if not self._require_cmd(comm, "local"): + raise KsftSkipEx("Test requires command: " + comm) + if remote: + if not self._require_cmd(comm, "remote"): + raise KsftSkipEx("Test requires (remote) command: " + comm) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote.py b/tools/testing/selftests/drivers/net/lib/py/remote.py new file mode 100644 index 000000000000..b1780b987722 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import importlib + 
+_modules = {} + +def Remote(kind, args, src_path): + global _modules + + if kind not in _modules: + _modules[kind] = importlib.import_module("..remote_" + kind, __name__) + + dir_path = os.path.abspath(src_path + "/../") + return getattr(_modules[kind], "Remote")(args, dir_path) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_netns.py b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py new file mode 100644 index 000000000000..7d5eeb0271bc --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import subprocess + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + + def cmd(self, comm): + return subprocess.Popen(["ip", "netns", "exec", self.name, "bash", "-c", comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def deploy(self, what): + if os.path.isabs(what): + return what + return os.path.abspath(self.dir_path + "/" + what) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py new file mode 100644 index 000000000000..924addde19a3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import string +import subprocess +import random + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + self._tmpdir = None + + def __del__(self): + if self._tmpdir: + cmd("rm -rf " + self._tmpdir, host=self) + self._tmpdir = None + + def cmd(self, comm): + return subprocess.Popen(["ssh", "-q", self.name, comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def _mktmp(self): + return ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + + def deploy(self, what): + if not self._tmpdir: + self._tmpdir = "/tmp/" + self._mktmp() + 
cmd("mkdir " + self._tmpdir, host=self) + file_name = self._tmpdir + "/" + self._mktmp() + os.path.basename(what) + + if not os.path.isabs(what): + what = os.path.abspath(self.dir_path + "/" + what) + + cmd(f"scp {what} {self.name}:{file_name}") + return file_name diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 91891b9418d7..877cd6df94a1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -24,8 +24,8 @@ setup_prepare() busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 check_err $? "ports did not come up" - local lanes_exist=$(ethtool $swp1 | grep 'Lanes:') - if [[ -z $lanes_exist ]]; then + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + if [[ $? -ne 0 ]]; then log_test "SKIP: driver does not support lanes setting" exit 1 fi @@ -122,8 +122,9 @@ autoneg() ethtool_set $swp1 speed $max_speed lanes $lanes ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "$lanes lanes is autonegotiated" @@ -160,8 +161,9 @@ autoneg_force_mode() ethtool_set $swp2 speed $max_speed lanes $lanes autoneg off ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? 
"Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "Autoneg off, $lanes lanes detected during force mode" diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py new file mode 100755 index 000000000000..eb83e7b48797 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ping.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen, rand_port + + +def test_v4(cfg) -> None: + cfg.require_v4() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") + cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) + + +def test_v6(cfg) -> None: + cfg.require_v6() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}") + cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) + + +def test_tcp(cfg) -> None: + cfg.require_cmd("socat", remote=True) + + port = rand_port() + listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT" + + with bkg(listen_cmd, exit_wait=True) as nc: + wait_port_listen(port) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.baddr}:{port}", + shell=True, host=cfg.remote) + ksft_eq(nc.stdout.strip(), "ping") + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc: + wait_port_listen(port, host=cfg.remote) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.remote_baddr}:{port}", shell=True) + ksft_eq(nc.stdout.strip(), "ping") + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py new file mode 100755 index 000000000000..7a7b16b180e2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/stats.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from 
lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError +from lib.py import NetDrvEnv + +ethnl = EthtoolFamily() +netfam = NetdevFamily() +rtnl = RtnlFamily() + + +def check_pause(cfg) -> None: + global ethnl + + try: + ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("pause not supported by the device") + raise + + data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def check_fec(cfg) -> None: + global ethnl + + try: + ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("FEC not supported by the device") + raise + + data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def pkt_byte_sum(cfg) -> None: + global netfam, rtnl + + def get_qstat(test): + global netfam + stats = netfam.qstats_get({}, dump=True) + if stats: + for qs in stats: + if qs["ifindex"]== test.ifindex: + return qs + + qstat = get_qstat(cfg) + if qstat is None: + raise KsftSkipEx("qstats not supported by the device") + + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + ksft_in(key, qstat, "Drivers should always report basic keys") + + # Compare stats, rtnl stats and qstats must match, + # but the interface may be up, so do a series of dumps + # each time the more "recent" stats must be higher or same. 
+ def stat_cmp(rstat, qstat): + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + if rstat[key] != qstat[key]: + return rstat[key] - qstat[key] + return 0 + + for _ in range(10): + rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats'] + if stat_cmp(rtstat, qstat) < 0: + raise Exception("RTNL stats are lower, fetched later") + qstat = get_qstat(cfg) + if stat_cmp(rtstat, qstat) > 0: + raise Exception("Qstats are lower, fetched later") + + +def qstat_by_ifindex(cfg) -> None: + global netfam + global rtnl + + # Construct a map ifindex -> [dump, by-index, dump] + ifindexes = {} + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']] = [entry, None, None] + + for ifindex in ifindexes.keys(): + entry = netfam.qstats_get({"ifindex": ifindex}, dump=True) + ksft_eq(len(entry), 1) + ifindexes[entry[0]['ifindex']][1] = entry[0] + + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']][2] = entry + + if len(ifindexes) == 0: + raise KsftSkipEx("No ifindex supports qstats") + + # Now make sure the stats match/make sense + for ifindex, triple in ifindexes.items(): + all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys() + + for key in all_keys: + ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) + ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + + # Test invalid dumps + # 0 is invalid + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 0}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -34) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # loopback has no stats + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 1}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # Try to get stats for lowest unused ifindex but not 0 + devs = rtnl.getlink({}, dump=True) + all_ifindexes = set([dev["ifi-index"] for dev in devs]) + 
lowest = 2 + while lowest in all_ifindexes: + lowest += 1 + + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": lowest}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -19) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + +def main() -> None: + with NetDrvEnv(__file__) as cfg: + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede6249866..b7c8f29c09a9 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index 2de7c61d1ae3..3f74c09c56b6 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ 
-24,7 +24,7 @@ echo 0 > events/enable echo "Get the most frequently calling function" sample_events -target_func=`cut -d: -f3 trace | sed 's/call_site=\([^+]*\)+0x.*/\1/' | sort | uniq -c | sort | tail -n 1 | sed 's/^[ 0-9]*//'` +target_func=`cat trace | grep -o 'call_site=\([^+]*\)' | sed 's/call_site=//' | sort | uniq -c | sort | tail -n 1 | sed 's/^[ 0-9]*//'` if [ -z "$target_func" ]; then exit_fail fi diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 110d73917615..02a2a1b267c1 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,3 +1,5 @@ CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_TEST=y +CONFIG_FAILSLAB=y diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..14bbab0cce13 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -51,6 +51,7 @@ #include <stdarg.h> #include <string.h> #include <stdio.h> +#include <sys/utsname.h> #endif #ifndef ARRAY_SIZE @@ -79,6 +80,9 @@ #define KSFT_XPASS 3 #define KSFT_SKIP 4 +#ifndef __noreturn +#define __noreturn __attribute__((__noreturn__)) +#endif #define __printf(a, b) __attribute__((format(printf, a, b))) /* counters */ @@ -288,24 +292,26 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } -static inline int ksft_exit_pass(void) +static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline int ksft_exit_fail(void) +static inline __noreturn int 
ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -332,7 +338,7 @@ static inline int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -347,19 +353,19 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) exit(KSFT_FAIL); } -static inline int ksft_exit_xfail(void) +static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline int ksft_exit_xpass(void) +static inline __noreturn int ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -388,4 +394,21 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) 
exit(KSFT_SKIP); } +static inline int ksft_min_kernel_version(unsigned int min_major, + unsigned int min_minor) +{ +#ifdef NOLIBC + ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); + return 0; +#else + unsigned int major, minor; + struct utsname info; + + if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) + ksft_exit_fail_msg("Can't parse kernel version\n"); + + return major > min_major || (major == min_major && minor >= min_minor); +#endif +} + #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..ba3ddeda24bf 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -383,6 +383,7 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ + bool jmp = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,8 +400,10 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ + else \ + jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ @@ -1202,7 +1205,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? 
"%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ddba2c2fb5de..4eaba83cdcf3 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -135,8 +135,8 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data, irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, - "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" - " Guest timer interrupt was not trigged within the specified\n" + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 3bd03b088dda..81ce37ec407d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1037,8 +1037,19 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_property property, uint32_t value); +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); + +static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + return *((&entry->eax) + feature.reg) & BIT(feature.bit); +} + void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_feature feature, bool set); diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 
6628dc4dda89..1a6da7389bf1 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; - for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; - - GUEST_DONE(); + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } } struct vcpu_info { @@ -55,7 +56,7 @@ static void rendezvous_with_boss(void) static void run_vcpu(struct kvm_vcpu *vcpu) { vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); } static void *vcpu_worker(void *data) @@ -64,17 +65,13 @@ static void *vcpu_worker(void *data) struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; - struct kvm_regs regs; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); - /* Snapshot regs before the first run. */ - vcpu_regs_get(vcpu, ®s); rendezvous_with_boss(); run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_regs_set(vcpu, ®s); vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ /* Toggle CR0.WP to trigger a MMU context reset. 
*/ diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index e22848f747c0..0f9cabd99fd4 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -60,7 +60,7 @@ static void guest_run(struct test_vcpu_shared_data *shared_data) irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" - " Guest timer interrupt was not trigged within the specified\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 06b43ed23580..bd57d991e27d 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void) struct kvm_vm *vm; int r, i; -#if defined __aarch64__ || defined __x86_64__ +#if defined __aarch64__ || defined __riscv || defined __x86_64__ supported_flags |= KVM_MEM_READONLY; #endif diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 9e2879af7c20..40cc59f4e650 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -133,6 +133,43 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } +static void test_pv_unhalt(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_cpuid_entry2 *ent; + u32 kvm_sig_old; + + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + + TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + + /* KVM_PV_UNHALT test */ + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + + TEST_ASSERT(vcpu_cpuid_has(vcpu, 
X86_FEATURE_KVM_PV_UNHALT), + "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); + + /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + kvm_sig_old = ent->ebx; + ent->ebx = 0xdeadbeef; + vcpu_set_cpuid(vcpu); + + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + ent->ebx = kvm_sig_old; + vcpu_set_cpuid(vcpu); + + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + + /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ + + kvm_vm_free(vm); +} + int main(void) { struct kvm_vcpu *vcpu; @@ -151,4 +188,6 @@ int main(void) enter_guest(vcpu); kvm_vm_free(vm); + + test_pv_unhalt(); } diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 29609b52f8fa..26c85815f7e9 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -416,12 +416,30 @@ static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters static void guest_test_gp_counters(void) { + uint8_t pmu_version = guest_get_pmu_version(); uint8_t nr_gp_counters = 0; uint32_t base_msr; - if (guest_get_pmu_version()) + if (pmu_version) nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. 
+ */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + if (this_cpu_has(X86_FEATURE_PDCM) && rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) base_msr = MSR_IA32_PMC0; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 7f6f5f23fb9b..977948fd52e6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -28,16 +28,16 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 -static void l2_guest_code(void) +static void l2_guest_code(u64 *a, u64 *b) { - *(volatile uint64_t *)NESTED_TEST_MEM1; - *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + READ_ONCE(*a); + WRITE_ONCE(*a, 1); GUEST_SYNC(true); GUEST_SYNC(false); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); @@ -45,17 +45,33 @@ static void l2_guest_code(void) vmcall(); } +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + void l1_guest_code(struct vmx_pages *vmx) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, 
&l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); @@ -64,7 +80,7 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_DONE(); } -int main(int argc, char *argv[]) +static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; struct vmx_pages *vmx; @@ -76,8 +92,7 @@ int main(int argc, char *argv[]) struct ucall uc; bool done = false; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); @@ -103,11 +118,16 @@ int main(int argc, char *argv[]) * * Note that prepare_eptp should be called only L1's GPA map is done, * meaning after the last call to virt_map. + * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. */ - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); @@ -148,3 +168,15 @@ int main(int argc, char *argv[]) } } } + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index da2cade3bab0..aeeac5f83492 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -44,6 +44,20 @@ endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) top_srcdir = $(selfdir)/../../.. 
+# msg: emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +MAKEFLAGS += --no-print-directory +endif + ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif @@ -176,7 +190,8 @@ endif ifeq ($(OVERRIDE_TARGETS),) LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h $(OUTPUT)/%:%.c $(LOCAL_HDRS) - $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S $(COMPILE.S) $^ -o $@ diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c02990bbd56f..9007c420d52c 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -3,7 +3,7 @@ #include <stdbool.h> #include <sys/mman.h> #include <err.h> -#include <string.h> /* ffsl() */ +#include <strings.h> /* ffsl() */ #include <unistd.h> /* _SC_PAGESIZE */ #define BIT_ULL(nr) (1ULL << (nr)) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index cb418a2346bc..5befca249452 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -34,6 +34,7 @@ TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh +TEST_PROGS += nl_netdev.py TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh @@ -81,8 +82,6 @@ TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += csum 
-TEST_GEN_FILES += nat6to4.o -TEST_GEN_FILES += xdp_dummy.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh @@ -92,10 +91,13 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += bpf_offload.py TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) + TEST_INCLUDES := forwarding/lib.sh include ../lib.mk @@ -106,49 +108,4 @@ $(OUTPUT)/tcp_inq: LDLIBS += -lpthread $(OUTPUT)/bind_bhash: LDLIBS += -lpthread $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/ -# Rules to generate bpf objs -CLANG ?= clang -SCRATCH_DIR := $(OUTPUT)/tools -BUILD_DIR := $(SCRATCH_DIR)/build -BPFDIR := $(abspath ../../../lib/bpf) -APIDIR := $(abspath ../../../include/uapi) - -CCINCLUDE += -I../bpf -CCINCLUDE += -I../../../../usr/include/ -CCINCLUDE += -I$(SCRATCH_DIR)/include - -BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a - -MAKE_DIRS := $(BUILD_DIR)/libbpf -$(MAKE_DIRS): - mkdir -p $@ - -# Get Clang's default includes on this system, as opposed to those seen by -# '--target=bpf'. This fixes "missing" files on some architectures/distros, -# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. -# -# Use '-idirafter': Don't interfere with include mechanics except where the -# build would have failed anyways. 
-define get_sys_includes -$(shell $(1) $(2) -v -E - </dev/null 2>&1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') -endef - -ifneq ($(CROSS_COMPILE),) -CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%)) -endif - -CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) - -$(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o: $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) - $(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@ - -$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ - $(APIDIR)/linux/bpf.h \ - | $(BUILD_DIR)/libbpf - $(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ - EXTRA_CFLAGS='-g -O0' \ - DESTDIR=$(SCRATCH_DIR) prefix= all install_headers - -EXTRA_CLEAN := $(SCRATCH_DIR) +include bpf.mk diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index a2662348cdb1..b7b54d646b93 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -6,7 +6,9 @@ #include "../kselftest_harness.h" -struct in6_addr in6addr_v4mapped_any = { +static const __u32 in4addr_any = INADDR_ANY; +static const __u32 in4addr_loopback = INADDR_LOOPBACK; +static const struct in6_addr in6addr_v4mapped_any = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -14,8 +16,7 @@ struct in6_addr in6addr_v4mapped_any = { 0, 0, 0, 0 } }; - -struct in6_addr in6addr_v4mapped_loopback = { +static const struct in6_addr in6addr_v4mapped_loopback = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -24,137 +25,785 @@ struct in6_addr in6addr_v4mapped_loopback = { } }; +#define NR_SOCKETS 8 + FIXTURE(bind_wildcard) { - struct sockaddr_in addr4; - struct sockaddr_in6 addr6; + int fd[NR_SOCKETS]; + socklen_t addrlen[NR_SOCKETS]; + union { + struct sockaddr addr; + struct sockaddr_in 
addr4; + struct sockaddr_in6 addr6; + } addr[NR_SOCKETS]; }; FIXTURE_VARIANT(bind_wildcard) { - const __u32 addr4_const; - const struct in6_addr *addr6_const; - int expected_errno; + sa_family_t family[2]; + const void *addr[2]; + bool ipv6_only[2]; + + /* 6 bind() calls below follow two bind() for the defined 2 addresses: + * + * 0.0.0.0 + * 127.0.0.1 + * :: + * ::1 + * ::ffff:0.0.0.0 + * ::ffff:127.0.0.1 + */ + int expected_errno[NR_SOCKETS]; + int expected_reuse_errno[NR_SOCKETS]; +}; + +/* (IPv4, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_any, &in4addr_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_loopback, &in4addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; +/* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_any, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + 
EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_loopback, - .expected_errno = 0, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_any, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_loopback, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_any, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, 
&in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_loopback, - .expected_errno = 0, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_any, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_loopback, - .expected_errno = EADDRINUSE, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +/* (IPv6, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + 
EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_any}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) +{ + .family = 
{AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +/* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + 
EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 
EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + 
EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, 
v6_v4mapped_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, +}; + +static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, + int 
family, const void *addr_const) +{ + if (family == AF_INET) { + struct sockaddr_in *addr4 = &self->addr[i].addr4; + const __u32 *addr4_const = addr_const; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(0); + addr4->sin_addr.s_addr = htonl(*addr4_const); + + self->addrlen[i] = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *addr6 = &self->addr[i].addr6; + const struct in6_addr *addr6_const = addr_const; + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(0); + addr6->sin6_addr = *addr6_const; + + self->addrlen[i] = sizeof(struct sockaddr_in6); + } +} + FIXTURE_SETUP(bind_wildcard) { - self->addr4.sin_family = AF_INET; - self->addr4.sin_port = htons(0); - self->addr4.sin_addr.s_addr = htonl(variant->addr4_const); + setup_addr(self, 0, variant->family[0], variant->addr[0]); + setup_addr(self, 1, variant->family[1], variant->addr[1]); + + setup_addr(self, 2, AF_INET, &in4addr_any); + setup_addr(self, 3, AF_INET, &in4addr_loopback); - self->addr6.sin6_family = AF_INET6; - self->addr6.sin6_port = htons(0); - self->addr6.sin6_addr = *variant->addr6_const; + setup_addr(self, 4, AF_INET6, &in6addr_any); + setup_addr(self, 5, AF_INET6, &in6addr_loopback); + setup_addr(self, 6, AF_INET6, &in6addr_v4mapped_any); + setup_addr(self, 7, AF_INET6, &in6addr_v4mapped_loopback); } FIXTURE_TEARDOWN(bind_wildcard) { + int i; + + for (i = 0; i < NR_SOCKETS; i++) + close(self->fd[i]); } -void bind_sockets(struct __test_metadata *_metadata, - FIXTURE_DATA(bind_wildcard) *self, - int expected_errno, - struct sockaddr *addr1, socklen_t addrlen1, - struct sockaddr *addr2, socklen_t addrlen2) +void bind_socket(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_wildcard) *self, + const FIXTURE_VARIANT(bind_wildcard) *variant, + int i, int reuse) { - int fd[2]; int ret; - fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[0], 0); + self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); + ASSERT_GT(self->fd[i], 0); - ret = 
bind(fd[0], addr1, addrlen1); - ASSERT_EQ(ret, 0); + if (i < 2 && variant->ipv6_only[i]) { + ret = setsockopt(self->fd[i], SOL_IPV6, IPV6_V6ONLY, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } - ret = getsockname(fd[0], addr1, &addrlen1); - ASSERT_EQ(ret, 0); + if (i < 2 && reuse) { + ret = setsockopt(self->fd[i], SOL_SOCKET, reuse, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } - ((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port; + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; - fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[1], 0); + ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); - ret = bind(fd[1], addr2, addrlen2); - if (expected_errno) { - ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, expected_errno); + if (reuse) { + if (variant->expected_reuse_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_reuse_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } else { + if (variant->expected_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } + } + + if (i == 0) { + ret = getsockname(self->fd[0], &self->addr[0].addr, &self->addrlen[0]); ASSERT_EQ(ret, 0); } +} - close(fd[1]); - close(fd[0]); +TEST_F(bind_wildcard, plain) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, 0); } -TEST_F(bind_wildcard, v4_v6) +TEST_F(bind_wildcard, reuseaddr) { - bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr4, sizeof(self->addr4), - (struct sockaddr *)&self->addr6, sizeof(self->addr6)); + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEADDR); } -TEST_F(bind_wildcard, v6_v4) +TEST_F(bind_wildcard, reuseport) { - bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr6, sizeof(self->addr6), - (struct sockaddr *)&self->addr4, sizeof(self->addr4)); + int i; + + for (i = 0; 
i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEPORT); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/bpf.mk b/tools/testing/selftests/net/bpf.mk new file mode 100644 index 000000000000..a4f6755dd894 --- /dev/null +++ b/tools/testing/selftests/net/bpf.mk @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: GPL-2.0 +# Rules to generate bpf objs +CLANG ?= clang +SCRATCH_DIR := $(OUTPUT)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +BPFDIR := $(top_srcdir)/tools/lib/bpf +APIDIR := $(top_srcdir)/tools/include/uapi + +CCINCLUDE += -I$(selfdir)/bpf +CCINCLUDE += -I$(top_srcdir)/usr/include/ +CCINCLUDE += -I$(SCRATCH_DIR)/include + +BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a + +MAKE_DIRS := $(BUILD_DIR)/libbpf +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +# Get Clang's default includes on this system, as opposed to those seen by +# '--target=bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+define get_sys_includes +$(shell $(1) $(2) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') +endef + +ifneq ($(CROSS_COMPILE),) +CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif + +CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + +BPF_PROG_OBJS := $(patsubst %.c,$(OUTPUT)/%.o,$(wildcard *.bpf.c)) + +$(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) + $(call msg,BPF_PROG,,$@) + $(Q)$(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $< -o $@ + +$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + $(APIDIR)/linux/bpf.h \ + | $(BUILD_DIR)/libbpf + $(call msg,MAKE,,$@) + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0' \ + DESTDIR=$(SCRATCH_DIR) prefix= all install_headers + +EXTRA_CLEAN += $(SCRATCH_DIR) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/net/bpf_offload.py index 6157f884d091..3efe44f6e92a 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -29,6 +29,9 @@ import subprocess import time import traceback +from lib.py import NetdevSim, NetdevSimDev + + logfile = None log_level = 1 skip_extack = False @@ -145,8 +148,10 @@ def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): if JSON: params += "%s " % (flags["json"]) - if ns != "": + if ns: ns = "ip netns exec %s " % (ns) + elif ns is None: + ns = "" if include_stderr: ret, stdout, stderr = cmd(ns + name + " " + params + args, @@ -201,11 +206,11 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): time.sleep(0.05) raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) -def 
bpftool_map_list_wait(expected=0, n_retry=20): +def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): for i in range(n_retry): - nmaps = len(bpftool_map_list()) - if nmaps == expected: - return + maps = bpftool_map_list(ns=ns) + if len(maps) == expected: + return maps time.sleep(0.05) raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) @@ -237,7 +242,7 @@ def tc(args, JSON=True, ns="", fail=True, include_stderr=False): def ethtool(dev, opt, args, fail=True): return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) -def bpf_obj(name, sec=".text", path=bpf_test_dir,): +def bpf_obj(name, sec="xdp", path=bpf_test_dir,): return "obj %s sec %s" % (os.path.join(path, name), sec) def bpf_pinned(name): @@ -334,72 +339,16 @@ class DebugfsDir: return dfs -class NetdevSimDev: +class BpfNetdevSimDev(NetdevSimDev): """ Class for netdevsim bus device and its attributes. """ - @staticmethod - def ctrl_write(path, val): - fullpath = os.path.join("/sys/bus/netdevsim/", path) - try: - with open(fullpath, "w") as f: - f.write(val) - except OSError as e: - log("WRITE %s: %r" % (fullpath, val), -e.errno) - raise e - log("WRITE %s: %r" % (fullpath, val), 0) - - def __init__(self, port_count=1): - addr = 0 - while True: - try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) - except OSError as e: - if e.errno == errno.ENOSPC: - addr += 1 - continue - raise e - break - self.addr = addr - - # As probe of netdevsim device might happen from a workqueue, - # so wait here until all netdevs appear. 
- self.wait_for_netdevs(port_count) - - ret, out = cmd("udevadm settle", fail=False) - if ret: - raise Exception("udevadm settle failed") - ifnames = self.get_ifnames() - + def __init__(self, port_count=1, ns=None): + super().__init__(port_count, ns=ns) devs.append(self) - self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr - - self.nsims = [] - for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) - - def get_ifnames(self): - ifnames = [] - listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) - for ifname in listdir: - ifnames.append(ifname) - ifnames.sort() - return ifnames - - def wait_for_netdevs(self, port_count): - timeout = 5 - timeout_start = time.time() - - while True: - try: - ifnames = self.get_ifnames() - except FileNotFoundError as e: - ifnames = [] - if len(ifnames) == port_count: - break - if time.time() < timeout_start + timeout: - continue - raise Exception("netdevices did not appear within timeout") + + def _make_port(self, port_index, ifname): + return BpfNetdevSim(self, port_index, ifname, self.ns) def dfs_num_bound_progs(self): path = os.path.join(self.dfs_dir, "bpf_bound_progs") @@ -415,33 +364,20 @@ class NetdevSimDev: return progs def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) + super().remove() devs.remove(self) - def remove_nsim(self, nsim): - self.nsims.remove(nsim) - self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), - "%u" % (nsim.port_index, )) -class NetdevSim: +class BpfNetdevSim(NetdevSim): """ Class for netdevsim netdevice and its attributes. """ - def __init__(self, nsimdev, port_index, ifname): - # In case udev renamed the netdev to according to new schema, - # check if the name matches the port_index. 
- nsimnamere = re.compile("eni\d+np(\d+)") - match = nsimnamere.match(ifname) - if match and int(match.groups()[0]) != port_index + 1: - raise Exception("netdevice name mismatches the expected one") - - self.nsimdev = nsimdev - self.port_index = port_index - self.ns = "" + def __init__(self, nsimdev, port_index, ifname, ns=None): + super().__init__(nsimdev, port_index, ifname, ns=ns) + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) self.dfs_refresh() - _, [self.dev] = ip("link show dev %s" % ifname) def __getitem__(self, key): return self.dev[key] @@ -468,7 +404,7 @@ class NetdevSim: raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) def set_ns(self, ns): - name = "1" if ns == "" else ns + name = ns if ns else "1" ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) self.ns = ns @@ -605,7 +541,7 @@ def pin_prog(file_name, idx=0): return file_name, bpf_pinned(file_name) def pin_map(file_name, idx=0, expected=1): - maps = bpftool_map_list(expected=expected) + maps = bpftool_map_list_wait(expected=expected) m = maps[idx] bpftool("map pin id %d %s" % (m["id"], file_name)) files.append(file_name) @@ -618,7 +554,7 @@ def check_dev_info_removed(prog_file=None, map_file=None): ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) fail(ret != 0, "failed to show prog with removed device") - bpftool_map_list(expected=0) + bpftool_map_list_wait(expected=0) ret, err = bpftool("map show pin %s" % (map_file), fail=False) fail(ret == 0, "Showing map with removed device did not fail") fail(err["error"].find("No such device") == -1, @@ -642,7 +578,7 @@ def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): else: fail("ifname" in dev.keys(), "Ifname is reported for other ns") - maps = bpftool_map_list(expected=2, ns=ns) + maps = bpftool_map_list_wait(expected=2, ns=ns) for m in maps: fail("dev" not in m.keys(), "Device parameters 
not reported") fail(dev != m["dev"], "Map's device different than program's") @@ -744,7 +680,7 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): start_test("Test multi-attachment XDP - device remove...") simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) return [simdev, sim] @@ -809,13 +745,13 @@ try: bytecode = bpf_bytecode("1,6 0 0 4294967295,") start_test("Test destruction of generic XDP...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(obj, "generic") simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.tc_add_ingress() @@ -967,7 +903,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -976,7 +912,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -1080,7 +1016,7 @@ try: bpftool_prog_list_wait(expected=0) start_test("Test attempt to use a program for a wrong device...") - simdev2 = NetdevSimDev() + simdev2 = BpfNetdevSimDev() sim2, = simdev2.nsims sim2.set_xdp(obj, "offload") pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") @@ -1169,7 +1105,7 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims map_obj = bpf_obj("sample_map_ret0.bpf.o") start_test("Test loading program with maps...") @@ -1201,12 +1137,12 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims start_test("Test map update (no flags)...") sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - maps = bpftool_map_list(expected=2) + maps = bpftool_map_list_wait(expected=2) array = maps[0] if 
maps[0]["type"] == "array" else maps[1] htab = maps[0] if maps[0]["type"] == "hash" else maps[1] for m in maps: @@ -1285,14 +1221,14 @@ try: bpftool_map_list_wait(expected=0) simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON simdev.remove() bpftool_map_list_wait(expected=0) start_test("Test map creation fail path...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.dfs["bpf_map_accept"] = "N" ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) @@ -1302,9 +1238,9 @@ try: simdev.remove() start_test("Test multi-dev ASIC program reuse...") - simdevA = NetdevSimDev() + simdevA = BpfNetdevSimDev() simA, = simdevA.nsims - simdevB = NetdevSimDev(3) + simdevB = BpfNetdevSimDev(3) simB1, simB2, simB3 = simdevB.nsims sims = (simA, simB1, simB2, simB3) simB = (simB1, simB2, simB3) diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 51157a5559b7..7c01f58a20de 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} RTABLE=100 RTABLE_PEER=101 +RTABLE_VRF=102 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" +TESTS=" + fib_rule6 + fib_rule4 + fib_rule6_connect + fib_rule4_connect + fib_rule6_vrf + fib_rule4_vrf +" SELFTEST_PATH="" @@ -27,13 +35,18 @@ log_test() local expected=$2 local msg="$3" + $IP rule show | grep -q l3mdev + if [ $? 
-eq 0 ]; then + msg="$msg (VRF)" + fi + if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-50s [ OK ]\n" "${msg}" + printf "\n TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-50s [FAIL]\n" "${msg}" + printf "\n TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -130,6 +143,17 @@ cleanup_peer() ip netns del $peerns } +setup_vrf() +{ + $IP link add name vrf0 up type vrf table $RTABLE_VRF + $IP link set dev $DEV master vrf0 +} + +cleanup_vrf() +{ + $IP link del dev vrf0 +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -248,6 +272,13 @@ fib_rule6_test() fi } +fib_rule6_vrf_test() +{ + setup_vrf + fib_rule6_test + cleanup_vrf +} + # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly # taken into account when connecting the socket and when sending packets. fib_rule6_connect_test() @@ -385,6 +416,13 @@ fib_rule4_test() fi } +fib_rule4_vrf_test() +{ + setup_vrf + fib_rule4_test + cleanup_vrf +} + # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken # into account when connecting the socket and when sending packets. 
fib_rule4_connect_test() @@ -467,6 +505,8 @@ do fib_rule4_test|fib_rule4) fib_rule4_test;; fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; + fib_rule6_vrf_test|fib_rule6_vrf) fib_rule6_vrf_test;; + fib_rule4_vrf_test|fib_rule4_vrf) fib_rule4_vrf_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 4103ed7afcde..7913c6ee418d 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -95,27 +95,9 @@ source "$net_forwarding_dir/../lib.sh" # timeout in seconds slowwait() { - local timeout=$1; shift - - local start_time="$(date -u +%s)" - while true - do - local out - out=$("$@") - local ret=$? - if ((!ret)); then - echo -n "$out" - return 0 - fi - - local current_time="$(date -u +%s)" - if ((current_time - start_time > timeout)); then - echo -n "$out" - return 1 - fi + local timeout_sec=$1; shift - sleep 0.1 - done + loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@" } ############################################################################## @@ -291,11 +273,6 @@ if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then require_command mreceive fi -if [[ ! -v NUM_NETIFS ]]; then - echo "SKIP: importer does not define \"NUM_NETIFS\"" - exit $ksft_skip -fi - ############################################################################## # Command line options handling @@ -314,6 +291,23 @@ done ############################################################################## # Network interfaces configuration +if [[ ! -v NUM_NETIFS ]]; then + echo "SKIP: importer does not define \"NUM_NETIFS\"" + exit $ksft_skip +fi + +if (( NUM_NETIFS > ${#NETIFS[@]} )); then + echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})" + exit $ksft_skip +fi + +for i in $(seq ${#NETIFS[@]}); do + if [[ ! 
${NETIFS[p$i]} ]]; then + echo "SKIP: NETIFS[p$i] not given" + exit $ksft_skip + fi +done + create_netif_veth() { local i @@ -2144,6 +2138,8 @@ bail_on_lldpad() { local reason1="$1"; shift local reason2="$1"; shift + local caller=${FUNCNAME[1]} + local src=${BASH_SOURCE[1]} if systemctl is-active --quiet lldpad; then @@ -2164,7 +2160,8 @@ bail_on_lldpad() an environment variable ALLOW_LLDPAD to a non-empty string. EOF - exit 1 + log_test_skip $src:$caller + exit $EXIT_STATUS else return fi diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 3f0f5dc95542..2ba44247c60a 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index 4b483d24ad00..cd9e346436fc 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ 
b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh index f3a53738bdcc..92904b01eae9 100755 --- a/tools/testing/selftests/net/forwarding/router_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_nh.sh @@ -1,6 +1,20 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +-------------------------+ +# | H1 | | H2 | +# | $h1 + | | $h2 + | +# | 192.0.2.2/24 | | | 198.51.100.2/24 | | +# | 2001:db8:1::2/64 | | | 2001:db8:2::2/64 | | +# +-------------------|-----+ +-------------------|-----+ +# | | +# +-------------------|----------------------------|-----+ +# | R1 | | | +# | $rp1 + $rp2 + | +# | 192.0.2.1/24 198.51.100.1/24 | +# | 2001:db8:1::1/64 2001:db8:2::1/64 | +# +------------------------------------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index b7f7b8695165..c868c0aec121 100644 --- 
a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -58,9 +58,10 @@ ksft_exit_status_merge() $ksft_xfail $ksft_pass $ksft_skip $ksft_fail } -busywait() +loopy_wait() { - local timeout=$1; shift + local sleep_cmd=$1; shift + local timeout_ms=$1; shift local start_time="$(date -u +%s%3N)" while true @@ -74,13 +75,22 @@ busywait() fi local current_time="$(date -u +%s%3N)" - if ((current_time - start_time > timeout)); then + if ((current_time - start_time > timeout_ms)); then echo -n "$out" return 1 fi + + $sleep_cmd done } +busywait() +{ + local timeout_ms=$1; shift + + loopy_wait : "$timeout_ms" "$@" +} + cleanup_ns() { local ns="" diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile new file mode 100644 index 000000000000..48557e6250dd --- /dev/null +++ b/tools/testing/selftests/net/lib/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_FILES := ../../../../../Documentation/netlink/specs +TEST_FILES += ../../../../net/ynl + +TEST_INCLUDES := $(wildcard py/*.py) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py new file mode 100644 index 000000000000..b6d498d125fe --- /dev/null +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .consts import KSRC +from .ksft import * +from .netns import NetNS +from .nsim import * +from .utils import * +from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py new file mode 100644 index 000000000000..f518ce79d82c --- /dev/null +++ b/tools/testing/selftests/net/lib/py/consts.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../..").resolve() +KSRC = (Path(__file__).parent / 
"../../../../../..").resolve() + +KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py new file mode 100644 index 000000000000..f84e9fdd0032 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: GPL-2.0 + +import builtins +import inspect +import sys +import time +import traceback +from .consts import KSFT_MAIN_NAME + +KSFT_RESULT = None +KSFT_RESULT_ALL = True + + +class KsftSkipEx(Exception): + pass + + +class KsftXfailEx(Exception): + pass + + +def ksft_pr(*objs, **kwargs): + print("#", *objs, **kwargs) + + +def _fail(*args): + global KSFT_RESULT + KSFT_RESULT = False + + frame = inspect.stack()[2] + ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + ksft_pr(*args) + + +def ksft_eq(a, b, comment=""): + global KSFT_RESULT + if a != b: + _fail("Check failed", a, "!=", b, comment) + + +def ksft_true(a, comment=""): + if not a: + _fail("Check failed", a, "does not eval to True", comment) + + +def ksft_in(a, b, comment=""): + if a not in b: + _fail("Check failed", a, "not in", b, comment) + + +def ksft_ge(a, b, comment=""): + if a < b: + _fail("Check failed", a, "<", b, comment) + + +class ksft_raises: + def __init__(self, expected_type): + self.exception = None + self.expected_type = expected_type + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised") + elif self.expected_type != exc_type: + _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}") + self.exception = exc_val + # Suppress the exception if its the expected one + return self.expected_type == exc_type + + +def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): + end = time.monotonic() + deadline + while True: + if cond(): + return + if 
time.monotonic() > end: + _fail("Waiting for condition timed out", comment) + return + time.sleep(sleep) + + +def ktap_result(ok, cnt=1, case="", comment=""): + global KSFT_RESULT_ALL + KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok + + res = "" + if not ok: + res += "not " + res += "ok " + res += str(cnt) + " " + res += KSFT_MAIN_NAME + if case: + res += "." + str(case.__name__) + if comment: + res += " # " + comment + print(res) + + +def ksft_run(cases=None, globs=None, case_pfx=None, args=()): + cases = cases or [] + + if globs and case_pfx: + for key, value in globs.items(): + if not callable(value): + continue + for prefix in case_pfx: + if key.startswith(prefix): + cases.append(value) + break + + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} + + print("KTAP version 1") + print("1.." + str(len(cases))) + + global KSFT_RESULT + cnt = 0 + for case in cases: + KSFT_RESULT = True + cnt += 1 + try: + case(*args) + except KsftSkipEx as e: + ktap_result(True, cnt, case, comment="SKIP " + str(e)) + totals['skip'] += 1 + continue + except KsftXfailEx as e: + ktap_result(True, cnt, case, comment="XFAIL " + str(e)) + totals['xfail'] += 1 + continue + except Exception as e: + tb = traceback.format_exc() + for line in tb.strip().split('\n'): + ksft_pr("Exception|", line) + ktap_result(False, cnt, case) + totals['fail'] += 1 + continue + + ktap_result(KSFT_RESULT, cnt, case) + if KSFT_RESULT: + totals['pass'] += 1 + else: + totals['fail'] += 1 + + print( + f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" + ) + + +def ksft_exit(): + global KSFT_RESULT_ALL + sys.exit(0 if KSFT_RESULT_ALL else 1) diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py new file mode 100644 index 000000000000..ecff85f9074f --- /dev/null +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .utils import ip +import random 
+import string + + +class NetNS: + def __init__(self, name=None): + if name: + self.name = name + else: + self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + ip('netns add ' + self.name) + + def __del__(self): + if self.name: + ip('netns del ' + self.name) + self.name = None + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + self.__del__() + + def __str__(self): + return self.name + + def __repr__(self): + return f"NetNS({self.name})" diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py new file mode 100644 index 000000000000..06896cdf7c18 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json +import os +import random +import re +import time +from .utils import cmd, ip + + +class NetdevSim: + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname, ns=None): + # In case udev renamed the netdev to according to new schema, + # check if the name matches the port_index. + nsimnamere = re.compile(r"eni\d+np(\d+)") + match = nsimnamere.match(ifname) + if match and int(match.groups()[0]) != port_index + 1: + raise Exception("netdevice name mismatches the expected one") + + self.ifname = ifname + self.nsimdev = nsimdev + self.port_index = port_index + self.ns = ns + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) + ret = ip("-j link show dev %s" % ifname, ns=ns) + self.dev = json.loads(ret.stdout)[0] + self.ifindex = self.dev["ifindex"] + + def dfs_write(self, path, val): + self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val) + + +class NetdevSimDev: + """ + Class for netdevsim bus device and its attributes. 
+ """ + @staticmethod + def ctrl_write(path, val): + fullpath = os.path.join("/sys/bus/netdevsim/", path) + with open(fullpath, "w") as f: + f.write(val) + + def dfs_write(self, path, val): + fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path) + with open(fullpath, "w") as f: + f.write(val) + + def __init__(self, port_count=1, ns=None): + # nsim will spawn in init_net, we'll set to actual ns once we switch it there + self.ns = None + + if not os.path.exists("/sys/bus/netdevsim"): + cmd("modprobe netdevsim") + + addr = random.randrange(1 << 15) + while True: + try: + self.ctrl_write("new_device", "%u %u" % (addr, port_count)) + except OSError as e: + if e.errno == errno.ENOSPC: + addr = random.randrange(1 << 15) + continue + raise e + break + self.addr = addr + + # As probe of netdevsim device might happen from a workqueue, + # so wait here until all netdevs appear. + self.wait_for_netdevs(port_count) + + if ns: + cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}") + self.ns = ns + + cmd("udevadm settle", ns=self.ns) + ifnames = self.get_ifnames() + + self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr + + self.nsims = [] + for port_index in range(port_count): + self.nsims.append(self._make_port(port_index, ifnames[port_index])) + + self.removed = False + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. 
+ """ + self.remove() + + def _make_port(self, port_index, ifname): + return NetdevSim(self, port_index, ifname, self.ns) + + def get_ifnames(self): + ifnames = [] + listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/", + ns=self.ns).stdout.split() + for ifname in listdir: + ifnames.append(ifname) + ifnames.sort() + return ifnames + + def wait_for_netdevs(self, port_count): + timeout = 5 + timeout_start = time.time() + + while True: + try: + ifnames = self.get_ifnames() + except FileNotFoundError as e: + ifnames = [] + if len(ifnames) == port_count: + break + if time.time() < timeout_start + timeout: + continue + raise Exception("netdevices did not appear within timeout") + + def remove(self): + if not self.removed: + self.ctrl_write("del_device", "%u" % (self.addr, )) + self.removed = True + + def remove_nsim(self, nsim): + self.nsims.remove(nsim) + self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), + "%u" % (nsim.port_index, )) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py new file mode 100644 index 000000000000..d3715e6c21f2 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json as _json +import random +import re +import subprocess +import time + + +class cmd: + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None): + if ns: + comm = f'ip netns exec {ns} ' + comm + + self.stdout = None + self.stderr = None + self.ret = None + + self.comm = comm + if host: + self.proc = host.cmd(comm) + else: + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not background: + self.process(terminate=False, fail=fail) + + def process(self, terminate=True, fail=None): + if terminate: + self.proc.terminate() + stdout, stderr = self.proc.communicate(timeout=5) + self.stdout = stdout.decode("utf-8") + self.stderr = 
stderr.decode("utf-8") + self.proc.stdout.close() + self.proc.stderr.close() + self.ret = self.proc.returncode + + if self.proc.returncode != 0 and fail: + if len(stderr) > 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % + (self.proc.args, stdout, stderr)) + + +class bkg(cmd): + def __init__(self, comm, shell=True, fail=True, ns=None, host=None, + exit_wait=False): + super().__init__(comm, background=True, + shell=shell, fail=fail, ns=ns, host=host) + self.terminate = not exit_wait + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + return self.process(terminate=self.terminate) + + +def ip(args, json=None, ns=None, host=None): + cmd_str = "ip " + if json: + cmd_str += '-j ' + cmd_str += args + cmd_obj = cmd(cmd_str, ns=ns, host=host) + if json: + return _json.loads(cmd_obj.stdout) + return cmd_obj + + +def rand_port(): + """ + Get unprivileged port, for now just random, one day we may decide to check if used. 
+ """ + return random.randint(1024, 65535) + + +def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): + end = time.monotonic() + deadline + + pattern = f":{port:04X} .* " + if proto == "tcp": # for tcp protocol additionally check the socket state + pattern += "0A" + pattern = re.compile(pattern) + + while True: + data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout + for row in data.split("\n"): + if pattern.search(row): + return + if time.monotonic() > end: + raise Exception("Waiting for port listen timed out") + time.sleep(sleep) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py new file mode 100644 index 000000000000..1ace58370c06 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path +from .consts import KSRC, KSFT_DIR +from .ksft import ksft_pr, ktap_result + +# Resolve paths +try: + if (KSFT_DIR / "kselftest-list.txt").exists(): + # Running in "installed" selftests + tools_full_path = KSFT_DIR + SPEC_PATH = KSFT_DIR / "net/lib/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.lib.ynl.lib import YnlFamily, NlError + else: + # Running in tree + tools_full_path = KSRC / "tools" + SPEC_PATH = KSRC / "Documentation/netlink/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.ynl.lib import YnlFamily, NlError +except ModuleNotFoundError as e: + ksft_pr("Failed importing `ynl` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +# +# Wrapper classes, loading the right specs +# Set schema='' to avoid jsonschema validation, it's slow +# +class EthtoolFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), + schema='') + + +class RtnlFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + 
schema='') + + +class NetdevFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), + schema='') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index bc97ab33a00e..776d43a6922d 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -200,6 +200,58 @@ chk_msk_cestab() "${expected}" "${msg}" "" } +msk_info_get_value() +{ + local port="${1}" + local info="${2}" + + ss -N "${ns}" -inHM dport "${port}" | \ + mptcp_lib_get_info_value "${info}" "${info}" +} + +chk_msk_info() +{ + local port="${1}" + local info="${2}" + local cnt="${3}" + local msg="....chk ${info}" + local delta_ms=250 # half what we waited before, just to be sure + local now + + now=$(msk_info_get_value "${port}" "${info}") + + mptcp_lib_print_title "${msg}" + if { [ -z "${cnt}" ] || [ -z "${now}" ]; } && + ! mptcp_lib_expect_all_features; then + mptcp_lib_pr_skip "Feature probably not supported" + mptcp_lib_result_skip "${msg}" + elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then + mptcp_lib_pr_ok + mptcp_lib_result_pass "${msg}" + else + mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \ + "expected at least ${delta_ms}ms" + mptcp_lib_result_fail "${msg}" + ret=${KSFT_FAIL} + fi +} + +chk_last_time_info() +{ + local port="${1}" + local data_sent data_recv ack_recv + + data_sent=$(msk_info_get_value "${port}" "last_data_sent") + data_recv=$(msk_info_get_value "${port}" "last_data_recv") + ack_recv=$(msk_info_get_value "${port}" "last_ack_recv") + + sleep 0.5 # wait to check after if the timestamps difference + + chk_msk_info "${port}" "last_data_sent" "${data_sent}" + chk_msk_info "${port}" "last_data_recv" "${data_recv}" + chk_msk_info "${port}" "last_ack_recv" "${ack_recv}" +} + wait_connected() { local listener_ns="${1}" @@ -233,6 +285,7 @@ echo "b" | \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " 
+chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" chk_msk_inuse 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4c4248554826..b77fb7065bfb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -147,7 +147,7 @@ cleanup() mptcp_lib_check_mptcp mptcp_lib_check_kallsyms -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc sin=$(mktemp) sout=$(mktemp) @@ -383,12 +383,14 @@ do_transfer() local stat_cookierx_last local stat_csum_err_s local stat_csum_err_c + local stat_tcpfb_last_l stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") + stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") timeout ${timeout_test} \ ip netns exec ${listener_ns} \ @@ -457,11 +459,13 @@ do_transfer() local stat_cookietx_now local stat_cookierx_now local stat_ooo_now + local stat_tcpfb_now_l stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") + stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") 
expect_synrx=$((stat_synrx_last_l)) expect_ackrx=$((stat_ackrx_last_l)) @@ -508,6 +512,11 @@ do_transfer() fi fi + if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then + mptcp_lib_pr_fail "unexpected fallback to TCP" + rets=1 + fi + if [ $cookies -eq 2 ];then if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then extra+=" WARN: CookieSent: did not advance" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5e9211e89825..fefa9173bdaa 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -125,8 +124,8 @@ init_shapers() { local i for i in $(seq 1 4); do - tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1 - tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1 + tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms + tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms done } @@ -142,7 +141,7 @@ init() { mptcp_lib_check_mptcp mptcp_lib_check_kallsyms - mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}" + mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}" sin=$(mktemp) sout=$(mktemp) @@ -606,173 +605,65 @@ kill_events_pids() pm_nl_set_limits() { - local ns=$1 - local addrs=$2 - local subflows=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows - else - ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows - fi + mptcp_lib_pm_nl_set_limits "${@}" } pm_nl_add_endpoint() { - local ns=$1 - local addr=$2 - local flags _flags - local port _port - local dev _dev - local id _id - local nr=2 - - local p - for p in "${@}" - do - if [ $p = "flags" ]; then - eval _flags=\$"$nr" - [ -n "$_flags" ]; flags="flags $_flags" - fi - if [ $p = 
"dev" ]; then - eval _dev=\$"$nr" - [ -n "$_dev" ]; dev="dev $_dev" - fi - if [ $p = "id" ]; then - eval _id=\$"$nr" - [ -n "$_id" ]; id="id $_id" - fi - if [ $p = "port" ]; then - eval _port=\$"$nr" - [ -n "$_port" ]; port="port $_port" - fi - - nr=$((nr + 1)) - done - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port - else - ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port - fi + mptcp_lib_pm_nl_add_endpoint "${@}" } pm_nl_del_endpoint() { - local ns=$1 - local id=$2 - local addr=$3 - - if [ $ip_mptcp -eq 1 ]; then - [ $id -ne 0 ] && addr='' - ip -n $ns mptcp endpoint delete id $id $addr - else - ip netns exec $ns ./pm_nl_ctl del $id $addr - fi + mptcp_lib_pm_nl_del_endpoint "${@}" } pm_nl_flush_endpoint() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint flush - else - ip netns exec $ns ./pm_nl_ctl flush - fi + mptcp_lib_pm_nl_flush_endpoint "${@}" } pm_nl_show_endpoints() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint show - else - ip netns exec $ns ./pm_nl_ctl dump - fi + mptcp_lib_pm_nl_show_endpoints "${@}" } pm_nl_change_endpoint() { - local ns=$1 - local id=$2 - local flags=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint change id $id ${flags//","/" "} - else - ip netns exec $ns ./pm_nl_ctl set id $id flags $flags - fi + mptcp_lib_pm_nl_change_endpoint "${@}" } pm_nl_check_endpoint() { - local line expected_line local msg="$1" local ns=$2 local addr=$3 - local _flags="" - local flags - local _port - local port - local dev - local _id - local id + local flags dev id port print_check "${msg}" shift 3 while [ -n "$1" ]; do - if [ $1 = "flags" ]; then - _flags=$2 - [ -n "$_flags" ]; flags="flags $_flags" - shift - elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $1" - shift - elif [ $1 = "id" ]; then - _id=$2 - [ -n "$_id" ]; id="id $_id" + case "${1}" in + "flags" | "dev" | "id" | "port") + eval "${1}"="${2}" shift - elif [ 
$1 = "port" ]; then - _port=$2 - [ -n "$_port" ]; port=" port $_port" - shift - fi + ;; + *) + ;; + esac shift done - if [ -z "$id" ]; then + if [ -z "${id}" ]; then test_fail "bad test - missing endpoint id" return fi - if [ $ip_mptcp -eq 1 ]; then - # get line and trim trailing whitespace - line=$(ip -n $ns mptcp endpoint show $id) - line="${line% }" - # the dump order is: address id flags port dev - [ -n "$addr" ] && expected_line="$addr" - expected_line+=" $id" - [ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$port" ] && expected_line+=" $port" - else - line=$(ip netns exec $ns ./pm_nl_ctl get $_id) - # the dump order is: id flags dev address port - expected_line="$id" - [ -n "$flags" ] && expected_line+=" $flags" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$addr" ] && expected_line+=" $addr" - [ -n "$_port" ] && expected_line+=" $_port" - fi - if [ "$line" = "$expected_line" ]; then - print_ok - else - fail_test "expected '$expected_line' found '$line'" - fi + check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \ + "$(mptcp_lib_pm_nl_format_endpoints \ + "${id},${addr},${flags//","/" "},${dev},${port}")" } pm_nl_set_endpoint() @@ -3212,7 +3103,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then - tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5 + tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow @@ -3610,6 +3501,8 @@ endpoint_tests() local tests_pid=$! 
wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 @@ -3700,7 +3593,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37af8..ad2ebda5cb64 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -23,6 +23,7 @@ MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -384,6 +385,12 @@ mptcp_lib_check_tools() { exit ${KSFT_SKIP} fi ;; + "tc") + if ! tc -help &> /dev/null; then + mptcp_lib_pr_skip "Could not run test without tc tool" + exit ${KSFT_SKIP} + fi + ;; "ss") if ! 
ss -h | grep -q MPTCP; then mptcp_lib_pr_skip "ss tool does not support MPTCP" @@ -505,3 +512,131 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +} + +# format: <id>,<ip>,<flags>,<dev> +mptcp_lib_pm_nl_format_endpoints() { + local entry id ip flags dev port + + for entry in "${@}"; do + IFS=, read -r id ip flags dev port <<< "${entry}" + if mptcp_lib_is_ip_mptcp; then + echo -n "${ip}" + [ -n "${port}" ] && echo -n " port ${port}" + echo -n " id ${id}" + [ -n "${flags}" ] && echo -n " ${flags}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo " " # always a space at the end + else + echo -n "id ${id}" + echo -n " flags ${flags//" "/","}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo -n " ${ip}" + [ -n "${port}" ] && echo -n " ${port}" + echo "" + fi + done +} + +mptcp_lib_pm_nl_get_endpoint() { + local ns=${1} + local id=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show id "${id}" + else + ip netns exec "${ns}" ./pm_nl_ctl get "${id}" + fi +} + +mptcp_lib_pm_nl_set_limits() { + local ns=${1} + local addrs=${2} + local subflows=${3} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}" + else + ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}" + fi +} + +mptcp_lib_pm_nl_add_endpoint() { + local ns=${1} + local addr=${2} + local flags dev id port + local nr=2 + + local p + for p in "${@}"; do + case "${p}" in + "flags" | "dev" | "id" | "port") + eval "${p}"=\$"${nr}" + ;; + esac + + nr=$((nr + 1)) + done + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + else + ip netns exec 
"${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + fi +} + +mptcp_lib_pm_nl_del_endpoint() { + local ns=${1} + local id=${2} + local addr=${3} + + if mptcp_lib_is_ip_mptcp; then + [ "${id}" -ne 0 ] && addr='' + ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"} + else + ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}" + fi +} + +mptcp_lib_pm_nl_flush_endpoint() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint flush + else + ip netns exec "${ns}" ./pm_nl_ctl flush + fi +} + +mptcp_lib_pm_nl_show_endpoints() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show + else + ip netns exec "${ns}" ./pm_nl_ctl dump + fi +} + +mptcp_lib_pm_nl_change_endpoint() { + local ns=${1} + local id=${2} + local flags=${3} + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "} + else + ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}" + fi +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index e2d70c18786e..68899a303a1a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -22,6 +22,28 @@ ns1="" ns2="" ns_sbox="" +usage() { + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" +} + +while getopts "hi" option;do + case "$option" in + "h") + usage "$0" + exit ${KSFT_PASS} + ;; + "i") + mptcp_lib_set_ip_mptcp + ;; + "?") + usage "$0" + exit ${KSFT_FAIL} + ;; + esac +done + add_mark_rules() { local ns=$1 @@ -58,15 +80,15 @@ init() # let $ns2 reach any $ns1 address from any interface ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 
flags signal - ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal - ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal done - ip netns exec $ns1 ./pm_nl_ctl limits 8 8 - ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + mptcp_lib_pm_nl_set_limits "${ns1}" 8 8 + mptcp_lib_pm_nl_set_limits "${ns2}" 8 8 add_mark_rules $ns1 1 add_mark_rules $ns2 2 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 6ab8c5d36340..2757378b1b13 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,28 +1,28 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Double quotes to prevent globbing and word splitting is recommended in new -# code but we accept it, especially because there were too many before having -# address all other issues detected by shellcheck. -#shellcheck disable=SC2086 - . 
"$(dirname "${0}")/mptcp_lib.sh" ret=0 usage() { - echo "Usage: $0 [ -h ]" + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" } -optstring=h +optstring=hi while getopts "$optstring" option;do case "$option" in "h") - usage $0 + usage "$0" exit ${KSFT_PASS} ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") - usage $0 + usage "$0" exit ${KSFT_FAIL} ;; esac @@ -35,7 +35,7 @@ err=$(mktemp) #shellcheck disable=SC2317 cleanup() { - rm -f $err + rm -f "${err}" mptcp_lib_ns_exit "${ns1}" } @@ -46,6 +46,76 @@ trap cleanup EXIT mptcp_lib_ns_init ns1 +format_limits() { + local accept="${1}" + local subflows="${2}" + + if mptcp_lib_is_ip_mptcp; then + # with a space at the end + printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}" + else + printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}" + fi +} + +get_limits() { + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp limits + else + ip netns exec "${ns1}" ./pm_nl_ctl limits + fi +} + +format_endpoints() { + mptcp_lib_pm_nl_format_endpoints "${@}" +} + +get_endpoint() { + # shellcheck disable=SC2317 # invoked indirectly + mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" +} + +change_address() { + local addr=${1} + local flags=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}" + else + ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}" + fi +} + +set_limits() +{ + mptcp_lib_pm_nl_set_limits "${ns1}" "${@}" +} + +add_endpoint() +{ + mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}" +} + +del_endpoint() +{ + mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}" +} + +flush_endpoint() +{ + mptcp_lib_pm_nl_flush_endpoint "${ns1}" +} + +show_endpoints() +{ + mptcp_lib_pm_nl_show_endpoints "${ns1}" +} + +change_endpoint() +{ + mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}" +} + check() { local cmd="$1" @@ -67,125 +137,126 @@ check() fi } -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr 
list" +check "show_endpoints" "" "defaults addr list" -default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)" +default_limits="$(get_limits)" if mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 2" "defaults limits" + check "get_limits" "$(format_limits 0 2)" "defaults limits" fi -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup -check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 flags subflow dev lo +add_endpoint 10.0.1.3 flags signal,backup +check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 2 flags subflow dev lo 10.0.1.2 -id 3 flags signal,backup 10.0.1.3" "dump addrs" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2,subflow,lo" \ + "3,10.0.1.3,signal backup")" "dump addrs" -ip netns exec $ns1 ./pm_nl_ctl del 2 -check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3" "dump addrs after del" +del_endpoint 2 +check "get_endpoint 2" "" "simple del addr" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup")" "dump addrs after del" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" +add_endpoint 10.0.1.3 2>/dev/null +check "get_endpoint 4" "" "duplicate addr" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" +add_endpoint 10.0.1.4 flags signal +check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" for i in $(seq 5 9); do - 
ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 + add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done -check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" -check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" +check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" -ip netns exec $ns1 ./pm_nl_ctl del 9 +del_endpoint 9 for i in $(seq 10 255); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i - ip netns exec $ns1 ./pm_nl_ctl del $i + add_endpoint 10.0.0.9 id "${i}" + del_endpoint "${i}" done -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3 -id 4 flags signal 10.0.1.4 -id 5 flags signal 10.0.1.5 -id 6 flags signal 10.0.1.6 -id 7 flags signal 10.0.1.7 -id 8 flags signal 10.0.1.8" "id limit" - -ip netns exec $ns1 ./pm_nl_ctl flush -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" - -ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 8 8 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 -subflows 8" "set limits" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.1.2 -id 3 flags 10.0.1.7 -id 4 flags 
10.0.1.8 -id 100 flags 10.0.1.3 -id 101 flags 10.0.1.4 -id 254 flags 10.0.1.5 -id 255 flags 10.0.1.6" "set ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.0.1 -id 2 flags 10.0.0.4 -id 3 flags 10.0.0.6 -id 4 flags 10.0.0.7 -id 5 flags 10.0.0.8 -id 253 flags 10.0.0.5 -id 254 flags 10.0.0.2 -id 255 flags 10.0.0.3" "wrap-around ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup 10.0.1.1" "set flags (backup)" -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nobackup)" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup" \ + "4,10.0.1.4,signal" \ + "5,10.0.1.5,signal" \ + "6,10.0.1.6,signal" \ + "7,10.0.1.7,signal" \ + "8,10.0.1.8,signal")" "id limit" + +flush_endpoint +check "show_endpoints" "" "flush addrs" + +set_limits 9 1 2>/dev/null +check "get_limits" "${default_limits}" "rcv addrs above hard limit" + +set_limits 1 9 2>/dev/null +check "get_limits" "${default_limits}" "subflows above hard limit" + +set_limits 8 8 +check "get_limits" "$(format_limits 8 8)" "set limits" + +flush_endpoint +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 +add_endpoint 10.0.1.3 id 100 +add_endpoint 10.0.1.4 +add_endpoint 10.0.1.5 id 254 +add_endpoint 10.0.1.6 +add_endpoint 10.0.1.7 +add_endpoint 10.0.1.8 +check "show_endpoints" \ + "$(format_endpoints 
"1,10.0.1.1" \ + "2,10.0.1.2" \ + "3,10.0.1.7" \ + "4,10.0.1.8" \ + "100,10.0.1.3" \ + "101,10.0.1.4" \ + "254,10.0.1.5" \ + "255,10.0.1.6")" "set ids" + +flush_endpoint +add_endpoint 10.0.0.1 +add_endpoint 10.0.0.2 id 254 +add_endpoint 10.0.0.3 +add_endpoint 10.0.0.4 +add_endpoint 10.0.0.5 id 253 +add_endpoint 10.0.0.6 +add_endpoint 10.0.0.7 +add_endpoint 10.0.0.8 +check "show_endpoints" \ + "$(format_endpoints "1,10.0.0.1" \ + "2,10.0.0.4" \ + "3,10.0.0.6" \ + "4,10.0.0.7" \ + "5,10.0.0.8" \ + "253,10.0.0.5" \ + "254,10.0.0.2" \ + "255,10.0.0.3")" "wrap-around ids" + +flush_endpoint +add_endpoint 10.0.1.1 flags subflow +change_address 10.0.1.1 backup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ + "set flags (backup)" +change_address 10.0.1.1 nobackup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nobackup)" # fullmesh support has been added later -ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null -if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || +change_endpoint 1 fullmesh 2>/dev/null +if show_endpoints | grep -q "fullmesh" || mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,fullmesh 10.0.1.1" " (fullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nofullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup,fullmesh 10.0.1.1" " (backup,fullmesh)" + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ + " (fullmesh)" + change_endpoint 1 nofullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nofullmesh)" + change_endpoint 1 backup,fullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ + " (backup,fullmesh)" else for st in fullmesh nofullmesh 
backup,fullmesh; do st=" (${st})" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 1b2366220388..4b14b4412166 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -27,10 +27,11 @@ capout="" size=0 usage() { - echo "Usage: $0 [ -b ] [ -c ] [ -d ]" + echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]" echo -e "\t-b: bail out after first error, otherwise runs al testcases" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" echo -e "\t-d: debug this script" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" } # This function is used in the cleanup trap @@ -45,7 +46,7 @@ cleanup() } mptcp_lib_check_mptcp -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc # "$ns1" ns2 ns3 # ns1eth1 ns2eth1 ns2eth3 ns3eth1 @@ -85,8 +86,8 @@ setup() ip -net "$ns1" route add default via 10.0.2.2 metric 101 ip -net "$ns1" route add default via dead:beef:2::2 metric 101 - ip netns exec "$ns1" ./pm_nl_ctl limits 1 1 - ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow + mptcp_lib_pm_nl_set_limits "${ns1}" 1 1 + mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad @@ -108,7 +109,7 @@ setup() ip -net "$ns3" route add default via 10.0.3.2 ip -net "$ns3" route add default via dead:beef:3::2 - ip netns exec "$ns3" ./pm_nl_ctl limits 1 1 + mptcp_lib_pm_nl_set_limits "${ns3}" 1 1 # debug build can slow down measurably the test program # we use quite tight time limit on the run-time, to ensure @@ -216,8 +217,8 @@ run_test() shift 4 local msg=$* - [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1="" - [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2="" + [ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1="" + [ $delay2 -gt 0 ] && delay2="delay ${delay2}ms" || delay2="" 
for dev in ns1eth1 ns1eth2; do tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1 @@ -259,7 +260,7 @@ run_test() fi } -while getopts "bcdh" option;do +while getopts "bcdhi" option;do case "$option" in "h") usage $0 @@ -274,6 +275,9 @@ while getopts "bcdh" option;do "d") set -x ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") usage $0 exit ${KSFT_FAIL} diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.bpf.c index ac54c36b25fc..ac54c36b25fc 100644 --- a/tools/testing/selftests/net/nat6to4.c +++ b/tools/testing/selftests/net/nat6to4.bpf.c diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore index c2229b3e40d4..0a64d6d0e29a 100644 --- a/tools/testing/selftests/netfilter/.gitignore +++ b/tools/testing/selftests/net/netfilter/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -nf-queue -connect_close audit_logread +connect_close conntrack_dump_flush sctp_collision +nf_queue diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile new file mode 100644 index 000000000000..72c6001964a6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0 + +top_srcdir = ../../../../.. 
+ +HOSTPKG_CONFIG := pkg-config +MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) +MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) + +TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += conntrack_icmp_related.sh +TEST_PROGS += conntrack_ipip_mtu.sh +TEST_PROGS += conntrack_tcp_unreplied.sh +TEST_PROGS += conntrack_sctp_collision.sh +TEST_PROGS += conntrack_vrf.sh +TEST_PROGS += ipvs.sh +TEST_PROGS += nf_nat_edemux.sh +TEST_PROGS += nft_audit.sh +TEST_PROGS += nft_concat_range.sh +TEST_PROGS += nft_conntrack_helper.sh +TEST_PROGS += nft_fib.sh +TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_meta.sh +TEST_PROGS += nft_nat.sh +TEST_PROGS += nft_nat_zones.sh +TEST_PROGS += nft_queue.sh +TEST_PROGS += nft_synproxy.sh +TEST_PROGS += nft_zones_many.sh +TEST_PROGS += rpath.sh +TEST_PROGS += xt_string.sh + +TEST_GEN_PROGS = conntrack_dump_flush + +TEST_GEN_FILES = audit_logread +TEST_GEN_FILES += connect_close nf_queue +TEST_GEN_FILES += sctp_collision + +include ../../lib.mk + +$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS) + +$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) + +TEST_FILES := lib.sh + +TEST_INCLUDES := \ + ../lib.sh diff --git a/tools/testing/selftests/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c index a0a880fc2d9d..a0a880fc2d9d 100644 --- a/tools/testing/selftests/netfilter/audit_logread.c +++ b/tools/testing/selftests/net/netfilter/audit_logread.c diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh new file mode 100755 index 000000000000..d7806753f5de --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test for legacy br_netfilter module combined with connection tracking, +# a 
combination that doesn't really work. +# Multicast/broadcast packets race for hash table insertion. + +# eth0 br0 eth0 +# setup is: ns1 <->,ns0 <-> ns3 +# ns2 <-' `'-> ns4 + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns ns0 ns1 ns2 ns3 ns4 + +ret=0 + +do_ping() +{ + fromns="$1" + dstip="$2" + + if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then + echo "ERROR: ping from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + fi +} + +bcast_ping() +{ + fromns="$1" + dstip="$2" + + for i in $(seq 1 500); do + if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then + echo "ERROR: ping -b from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + break + fi + done +} + +ip netns exec "$ns0" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q net.ipv4.conf.default.rp_filter=0 + +if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi + +ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2" +ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3" +ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4" + +for i in $(seq 1 4); do + ip -net "$ns0" link set "veth$i" up +done + +if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +# make veth0,1,2 part of bridge. +for i in $(seq 1 3); do + ip -net "$ns0" link set "veth$i" master br0 +done + +# add a macvlan on top of the bridge. 
+MACVLAN_ADDR=ba:f3:13:37:42:23 +ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private +ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan0 up +ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0 + +# add a macvlan on top of veth4. +MACVLAN_ADDR=ba:f3:13:37:42:24 +ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru +ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan4 up + +# make the macvlan part of the bridge. +# veth4 is not a bridge port, only the macvlan on top of it. +ip -net "$ns0" link set macvlan4 master br0 + +ip -net "$ns0" link set br0 up +ip -net "$ns0" addr add 10.0.0.1/24 dev br0 + +modprobe -q br_netfilter +if ! ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then + echo "SKIP: bridge netfilter not available" + ret=$ksft_skip +fi + +# for testing, so namespaces will reply to ping -b probes. +ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 + +# enable conntrack in ns0 and drop broadcast packets in forward to +# avoid them from getting confirmed in the postrouting hook before +# the cloned skb is passed up the stack. +ip netns exec "$ns0" nft -f - <<EOF +table ip filter { + chain input { + type filter hook input priority 1; policy accept + iifname br0 counter + ct state new accept + } +} + +table bridge filter { + chain forward { + type filter hook forward priority 0; policy accept + meta pkttype broadcast ip protocol icmp counter drop + } +} +EOF +if [ "$?" -ne 0 ];then + echo "SKIP: could not add nftables ruleset" + exit $ksft_skip +fi + +# place 1, 2 & 3 in same subnet, connected via ns0:br0. +# ns4 is placed in same subnet as well, but its not +# part of the bridge: the corresponding veth4 is not +# part of the bridge, only its macvlan interface. 
+for i in $(seq 1 4); do + eval ip -net \$ns"$i" link set eth0 up +done +for i in $(seq 1 2); do + eval ip -net \$ns"$i" addr add "10.0.0.1$i/24" dev eth0 +done + +ip -net "$ns3" addr add 10.23.0.13/24 dev eth0 +ip -net "$ns4" addr add 10.23.0.14/24 dev eth0 + +# test basic connectivity +do_ping "$ns1" 10.0.0.12 +do_ping "$ns3" 10.23.0.1 +do_ping "$ns4" 10.23.0.1 + +bcast_ping "$ns1" 10.0.0.255 + +# This should deliver broadcast to macvlan0, which is on top of ns0:br0. +bcast_ping "$ns3" 10.23.0.255 + +# same, this time via veth4:macvlan4. +bcast_ping "$ns4" 10.23.0.255 + +read t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo PASS: kernel not tainted +else + echo ERROR: kernel is tainted + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/bridge_brouter.sh b/tools/testing/selftests/net/netfilter/bridge_brouter.sh new file mode 100755 index 000000000000..2549b6590693 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/bridge_brouter.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# +# This test is for bridge 'brouting', i.e. make some packets being routed +# rather than getting bridged even though they arrive on interface that is +# part of a bridge. + +# eth0 br0 eth0 +# setup is: ns1 <-> nsbr <-> ns2 + +source lib.sh + +if ! ebtables -V > /dev/null 2>&1;then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns nsbr ns1 ns2 + +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.all.rp_filter=0 +if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi +ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2" + +if ! 
ip -net "$nsbr" link add br0 type bridge; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +ip -net "$nsbr" link set veth0 up +ip -net "$nsbr" link set veth1 up + +ip -net "$nsbr" link set veth0 master br0 +ip -net "$nsbr" link set veth1 master br0 +ip -net "$nsbr" link set br0 up +ip -net "$nsbr" addr add 10.0.0.1/24 dev br0 + +# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0 +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns1" addr add 10.0.0.11/24 dev eth0 +ip -net "$ns2" addr add 10.0.0.12/24 dev eth0 + +test_ebtables_broute() +{ + # redirect is needed so the dstmac is rewritten to the bridge itself, + # ip stack won't process OTHERHOST (foreign unicast mac) packets. + if ! ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then + echo "SKIP: Could not add ebtables broute redirect rule" + return $ksft_skip + fi + + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0 + + # ping net${ns1}, expected to not work (ip forwarding is off) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed" 1>&2 + return 1 + fi + + # enable forwarding on both interfaces. + # neither needs an ip address, but at least the bridge needs + # an ip address in same network segment as ${ns1} and ${ns2} (${nsbr} + # needs to be able to determine route for to-be-forwarded packet). + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1 + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1 + + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule" + ip netns exec "$nsbr" ebtables -t broute -F + + # ping net${ns1}, expected to work (frames are bridged) + if ! 
ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (bridged)" 1>&2 + return 1 + fi + + ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP + + # ping net${ns1}, expected to not work (DROP in bridge forward) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 + return 1 + fi + + # re-activate brouter + ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + + if ! ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop" + return 0 +} + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then + echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then + echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2 + exit 1 +fi + +test_ebtables_broute +exit $? 
diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config new file mode 100644 index 000000000000..60b86c7f3ea1 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/config @@ -0,0 +1,87 @@ +CONFIG_AUDIT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BRIDGE=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_NETFILTER=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_CGROUP_BPF=y +CONFIG_DUMMY=m +CONFIG_INET_ESP=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP_NF_RAW=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP_SCTP=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_RR=m +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_MACVLAN=m +CONFIG_NAMESPACES=y +CONFIG_NET_CLS_U32=m +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NS=y +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_IPIP=m +CONFIG_NET_VRF=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_CT=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m 
+CONFIG_NFT_FIB_IPV4=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_NAT=m +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_SYNPROXY=m +CONFIG_VETH=m +CONFIG_VLAN_8021Q=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_STATISTICS=y diff --git a/tools/testing/selftests/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c index 1c3b0add54c4..1c3b0add54c4 100644 --- a/tools/testing/selftests/netfilter/connect_close.c +++ b/tools/testing/selftests/net/netfilter/connect_close.c diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c index b11ea8ee6719..bd9317bf5ada 100644 --- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -10,7 +10,7 @@ #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#include "../kselftest_harness.h" +#include "../../kselftest_harness.h" #define TEST_ZONE_ID 123 #define NF_CT_DEFAULT_ZONE_ID 0 @@ -313,13 +313,11 @@ FIXTURE_SETUP(conntrack_dump_flush) self->sock = mnl_socket_open(NETLINK_NETFILTER); if (!self->sock) { perror("mnl_socket_open"); - exit(EXIT_FAILURE); + SKIP(return, "cannot open netlink_netfilter socket"); } - if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } + ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID); + EXPECT_EQ(ret, 0); ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); if (ret < 0 && errno == EPERM) diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh index 76645aaf2b58..c63d840ead61 100755 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ 
b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh @@ -14,35 +14,32 @@ # check the icmp errors are propagated to the correct host as per # nat of "established" icmp-echo "connection". -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 +source lib.sh -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1;then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - cleanup() { - for i in 1 2;do ip netns del nsclient$i;done - for i in 1 2;do ip netns del nsrouter$i;done + cleanup_all_ns } trap cleanup EXIT -ipv4() { - echo -n 192.168.$1.2 -} +setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2 + +ret=0 + +add_addr() +{ + ns=$1 + dev=$2 + i=$3 -ipv6 () { - echo -n dead:$1::2 + ip -net "$ns" link set "$dev" up + ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev" + ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad } check_counter() @@ -52,10 +49,9 @@ check_counter() expect=$3 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns nft list counter inet filter "$name" 1>&2 + ip netns exec "$ns" nft list counter inet filter "$name" 1>&2 lret=1 fi @@ -65,9 +61,8 @@ check_counter() check_unknown() { expect="packets 0 bytes 0" - for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - check_counter $n "unknown" "$expect" - if [ $? -ne 0 ] ;then + for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do + if ! 
check_counter "$n" "unknown" "$expect"; then return 1 fi done @@ -75,61 +70,48 @@ check_unknown() return 0 } -for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - ip netns add $n - ip -net $n link set lo up -done - -DEV=veth0 -ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 DEV=veth0 -ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 +ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1" +ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2" +ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2" -DEV=veth0 -ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 +add_addr "$nsclient1" $DEV 1 +add_addr "$nsclient2" $DEV 2 -DEV=veth0 -for i in 1 2; do - ip -net nsclient$i link set $DEV up - ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV - ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV -done - -ip -net nsrouter1 link set eth1 up -ip -net nsrouter1 link set veth0 up +ip -net "$nsrouter1" link set eth1 up +ip -net "$nsrouter1" link set $DEV up -ip -net nsrouter2 link set eth1 up -ip -net nsrouter2 link set eth2 up +ip -net "$nsrouter2" link set eth1 mtu 1280 up +ip -net "$nsrouter2" link set eth2 up -ip -net nsclient1 route add default via 192.168.1.1 -ip -net nsclient1 -6 route add default via dead:1::1 +ip -net "$nsclient1" route add default via 192.168.1.1 +ip -net "$nsclient1" -6 route add default via dead:1::1 -ip -net nsclient2 route add default via 192.168.2.1 -ip -net nsclient2 route add default via dead:2::1 +ip -net "$nsclient2" route add default via 192.168.2.1 +ip -net "$nsclient2" route add default via dead:2::1 +ip -net "$nsclient2" link set veth0 mtu 1280 -i=3 -ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 -ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 -ip -net nsrouter1 addr add dead:1::1/64 dev eth1 -ip -net nsrouter1 addr add dead:3::1/64 dev veth0 -ip -net nsrouter1 route add 
default via 192.168.3.10 -ip -net nsrouter1 -6 route add default via dead:3::10 +ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1 +ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0 +ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad +ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad +ip -net "$nsrouter1" route add default via 192.168.3.10 +ip -net "$nsrouter1" -6 route add default via dead:3::10 -ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 -ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 -ip -net nsrouter2 addr add dead:2::1/64 dev eth1 -ip -net nsrouter2 addr add dead:3::10/64 dev eth2 -ip -net nsrouter2 route add default via 192.168.3.1 -ip -net nsrouter2 route add default via dead:3::1 +ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1 +ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2 +ip -net "$nsrouter2" addr add dead:2::1/64 dev eth1 nodad +ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad +ip -net "$nsrouter2" route add default via 192.168.3.1 +ip -net "$nsrouter2" route add default via dead:3::1 -sleep 2 for i in 4 6; do - ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 - ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1 done -for netns in nsrouter1 nsrouter2; do -ip netns exec $netns nft -f - <<EOF +for netns in "$nsrouter1" "$nsrouter2"; do +ip netns exec "$netns" nft -f - <<EOF table inet filter { counter unknown { } counter related { } @@ -144,7 +126,7 @@ table inet filter { EOF done -ip netns exec nsclient1 nft -f - <<EOF +ip netns exec "$nsclient1" nft -f - <<EOF table inet filter { counter unknown { } counter related { } @@ -164,7 +146,7 @@ table inet filter { } EOF -ip netns exec nsclient2 nft -f - <<EOF +ip netns exec "$nsclient2" nft -f - <<EOF table inet filter { counter unknown { } counter new { } @@ -189,11 +171,10 
@@ table inet filter { } EOF - # make sure NAT core rewrites adress of icmp error if nat is used according to # conntrack nat information (icmp error will be directed at nsrouter1 address, # but it needs to be routed to nsclient1 address). -ip netns exec nsrouter1 nft -f - <<EOF +ip netns exec "$nsrouter1" nft -f - <<EOF table ip nat { chain postrouting { type nat hook postrouting priority 0; policy accept; @@ -208,44 +189,32 @@ table ip6 nat { } EOF -ip netns exec nsrouter2 ip link set eth1 mtu 1280 -ip netns exec nsclient2 ip link set veth0 mtu 1280 -sleep 1 - -ip netns exec nsclient1 ping -c 1 -s 1000 -q -M do 192.168.2.2 >/dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q -M "do" 192.168.2.2 >/dev/null; then echo "ERROR: netns ip routing/connectivity broken" 1>&2 - cleanup exit 1 fi -ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 - cleanup exit 1 fi -check_unknown -if [ $? -ne 0 ]; then +if ! check_unknown; then ret=1 fi expect="packets 0 bytes 0" -for netns in nsrouter1 nsrouter2 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done expect="packets 2 bytes 2076" -check_counter nsclient2 "new" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsclient2" "new" "$expect"; then ret=1 fi -ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then echo "ERROR: ping should have failed with PMTU too big error" 1>&2 ret=1 fi @@ -253,30 +222,26 @@ fi # nsrouter2 should have generated the icmp error, so # related counter should be 0 (its in forward). 
expect="packets 0 bytes 0" -check_counter "nsrouter2" "related" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsrouter2" "related" "$expect"; then ret=1 fi # but nsrouter1 should have seen it, same for nsclient1. expect="packets 1 bytes 576" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in ${nsrouter1} ${nsclient1};do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done -ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 ret=1 fi expect="packets 2 bytes 1856" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "${nsrouter1}" "${nsclient1}";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done @@ -288,21 +253,19 @@ else fi # add 'bad' route, expect icmp REDIRECT to be generated -ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 -ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 +ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1 +ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1 -ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null expect="packets 1 bytes 112" -check_counter nsclient1 "redir4" "$expect" -if [ $? -ne 0 ];then +if ! check_counter "$nsclient1" "redir4" "$expect"; then ret=1 fi -ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null expect="packets 1 bytes 192" -check_counter nsclient1 "redir6" "$expect" -if [ $? -ne 0 ];then +if ! 
check_counter "$nsclient1" "redir6" "$expect"; then ret=1 fi diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh index eb9553e4986b..9832a5d0198a 100755 --- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -1,8 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Conntrack needs to reassemble fragments in order to have complete # packets for rule matching. Reassembly can lead to packet loss. @@ -23,56 +22,44 @@ ksft_skip=4 # between Client A and Client B over WAN. Wanrouter has MTU 1400 set # on its interfaces. -rnd=$(mktemp -u XXXXXXXX) rx=$(mktemp) -r_a="ns-ra-$rnd" -r_b="ns-rb-$rnd" -r_w="ns-rw-$rnd" -c_a="ns-ca-$rnd" -c_b="ns-cb-$rnd" - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "iptables --version" "run test without iptables" -checktool "ip -Version" "run test without ip tool" -checktool "which socat" "run test without socat" -checktool "ip netns add ${r_a}" "create net namespace" +checktool "socat -h" "run test without socat" -for n in ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns add ${n} -done +setup_ns r_a r_b r_w c_a c_b cleanup() { - for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns del ${n} - done - rm -f ${rx} + cleanup_all_ns + rm -f "$rx" } trap cleanup EXIT +listener_ready() +{ + ns="$1" + port="$2" + ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port" +} + test_path() { msg="$1" - ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & + ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000 - sleep 1 for i in 1 2 3; do head -c1400 /dev/zero | tr "\000" "a" | \ - ip netns exec ${c_a} socat -t 1 -u STDIN 
UDP:192.168.20.2:5000 + ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000 done wait - bytes=$(wc -c < ${rx}) + bytes=$(wc -c < "$rx") - if [ $bytes -eq 1400 ];then + if [ "$bytes" -eq 1400 ];then echo "OK: PMTU $msg connection tracking" else echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" @@ -91,24 +78,24 @@ test_path() { # 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) # No iptables rules at all. -ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} -ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} +ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w" +ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a" l_addr="10.2.2.1" r_addr="10.4.4.1" -ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip for dev in lo veth0 veth1 ipip0; do - ip -net ${r_a} link set $dev up + ip -net "$r_a" link set "$dev" up done -ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 -ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 +ip -net "$r_a" addr add 10.2.2.1/24 dev veth0 +ip -net "$r_a" addr add 192.168.10.1/24 dev veth1 -ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 -ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 +ip -net "$r_a" route add 192.168.20.0/24 dev ipip0 +ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254 -ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Detailed setup for Router B # --------------------------- @@ -121,49 +108,46 @@ ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) # No iptables rules at all. 
-ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} -ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} +ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w" +ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b" l_addr="10.4.4.1" r_addr="10.2.2.1" -ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_b} link set $dev up +for dev in veth0 veth1 ipip0; do + ip -net "$r_b" link set $dev up done -ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 -ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 +ip -net "$r_b" addr add 10.4.4.1/24 dev veth0 +ip -net "$r_b" addr add 192.168.20.1/24 dev veth1 -ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 -ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 -ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip -net "$r_b" route add 192.168.10.0/24 dev ipip0 +ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Client A -ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev lo up -ip -net ${c_a} link set dev veth0 up -ip -net ${c_a} route add default via 192.168.10.1 +ip -net "$c_a" addr add 192.168.10.2/24 dev veth0 +ip -net "$c_a" link set dev veth0 up +ip -net "$c_a" route add default via 192.168.10.1 # Client A -ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 -ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} link set dev lo up -ip -net ${c_b} route add default via 192.168.20.1 +ip -net "$c_b" addr add 192.168.20.2/24 dev veth0 +ip -net "$c_b" link set dev veth0 up +ip -net "$c_b" route add default via 192.168.20.1 # Wan -ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 -ip -net ${r_w} addr add 10.4.4.254/24 dev 
veth1 +ip -net "$r_w" addr add 10.2.2.254/24 dev veth0 +ip -net "$r_w" addr add 10.4.4.254/24 dev veth1 -ip -net ${r_w} link set dev lo up -ip -net ${r_w} link set dev veth0 up mtu 1400 -ip -net ${r_w} link set dev veth1 up mtu 1400 +ip -net "$r_w" link set dev veth0 up mtu 1400 +ip -net "$r_w" link set dev veth1 up mtu 1400 -ip -net ${r_a} link set dev veth0 mtu 1400 -ip -net ${r_b} link set dev veth0 mtu 1400 +ip -net "$r_a" link set dev veth0 mtu 1400 +ip -net "$r_b" link set dev veth0 mtu 1400 -ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Path MTU discovery # ------------------ @@ -203,5 +187,5 @@ test_path "without" #packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is #dropped on Router A before sending. -ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW +ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW test_path "with" diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh new file mode 100755 index 000000000000..d860f7d9744b --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing For SCTP COLLISION SCENARIO as Below: +# +# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] +# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] +# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] +# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] +# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] +# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] +# +# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS 
(link2)<--->(link3) CLIENT_NS + +source lib.sh + +CLIENT_IP="198.51.200.1" +CLIENT_PORT=1234 + +SERVER_IP="198.51.100.1" +SERVER_PORT=1234 + +CLIENT_GW="198.51.200.2" +SERVER_GW="198.51.100.2" + +# setup the topo +setup() { + setup_ns CLIENT_NS SERVER_NS ROUTER_NS + ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS" + ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS" + + ip -n "$SERVER_NS" link set link0 up + ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0 + ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW + + ip -n "$ROUTER_NS" link set link1 up + ip -n "$ROUTER_NS" link set link2 up + ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1 + ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2 + ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1 + + ip -n "$CLIENT_NS" link set link3 up + ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3 + ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW + + # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with + # tc on $SERVER_NS side + tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64 + tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit + tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ + 0xff match u8 2 0xff at 32 flowid 1:1 + if ! 
tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then + echo "SKIP: Cannot add netem qdisc" + exit $ksft_skip + fi + + # simulate the ctstate check on OVS nf_conntrack + ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP + ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP + + # use a smaller number for assoc's max_retrans to reproduce the issue + modprobe -q sctp + ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 +} + +cleanup() { + ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1 + ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1 + cleanup_all_ns +} + +do_test() { + ip net exec "$SERVER_NS" ./sctp_collision server \ + $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & + ip net exec "$CLIENT_NS" ./sctp_collision client \ + $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT +} + +# NOTE: one way to work around the issue is set a smaller hb_interval +# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 + +# run the test case +trap cleanup EXIT +setup && \ +echo "Test for SCTP Collision in nf_conntrack:" && \ +do_test && echo "PASS!" +exit $? diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh new file mode 100755 index 000000000000..1f862c089028 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that UNREPLIED tcp conntrack will eventually timeout. +# + +source lib.sh + +if ! nft --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +if ! 
conntrack --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +ret=0 + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +ipv4() { + echo -n 192.168."$1".2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then + echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 + ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +trap cleanup EXIT + +# Create test namespaces +setup_ns ns1 ns2 + +# Connect the namespace to the host using a veth pair +ip -net "$ns1" link add name veth1 type veth peer name veth2 +ip -net "$ns1" link set netns "$ns2" dev veth2 + +ip -net "$ns1" link set up dev lo +ip -net "$ns2" link set up dev lo +ip -net "$ns1" link set up dev veth1 +ip -net "$ns2" link set up dev veth2 + +ip -net "$ns2" addr add 10.11.11.2/24 dev veth2 +ip -net "$ns2" route add default via 10.11.11.1 + +ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1 + +# add a rule inside NS so we enable conntrack +ip netns exec "$ns1" nft -f - <<EOF +table inet filter { + chain input { + type filter hook input priority 0; policy accept; + ct state established accept + } +} +EOF + +ip -net "$ns1" addr add 10.11.11.1/24 dev veth1 +ip -net "$ns1" route add 10.99.99.99 via 10.11.11.2 + +# Check connectivity works +ip netns exec "$ns1" ping -q -c 2 10.11.11.2 >/dev/null || exit 1 + +ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT & + +ip netns exec "$ns2" nft -f - <<EOF +table inet filter { + counter connreq { } + counter redir { } + chain input { + type filter hook input priority 0; policy accept; + ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept + ct state new ct status dnat tcp dport 8080 counter 
name "redir" accept + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nft rules" + exit 1 +fi + +ip netns exec "$ns2" sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10 + +echo "INFO: connect $ns1 -> $ns2 to the virtual ip" +ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do + socat -u STDIN TCP:10.99.99.99:80 < /dev/null + sleep 0.1 + done' & + +ip netns exec "$ns2" nft -f - <<EOF +table inet nat { + chain prerouting { + type nat hook prerouting priority 0; policy accept; + ip daddr 10.99.99.99 tcp dport 80 redirect to :8080 + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nat redirect" + exit 1 +fi + +count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) +if [ "$count" -eq 0 ]; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + +wait_for_redirect() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} +echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect" + +busywait $BUSYWAIT_TIMEOUT wait_for_redirect +ret=$? + +expect="packets 1 bytes 60" +if ! check_counter "$ns2" "redir" "$expect"; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: redirection counter has expected values" +else + echo "ERROR: no tcp connection was redirected" +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index 8b5ea9234588..073e8e62d350 100755 --- a/tools/testing/selftests/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # This script demonstrates interaction of conntrack and vrf. 
# The vrf driver calls the netfilter hooks again, with oif/iif @@ -28,84 +28,67 @@ # that was supposed to be fixed by the commit mentioned above to make sure # that any fix to test case 1 won't break masquerade again. -ksft_skip=4 +source lib.sh IP0=172.30.30.1 IP1=172.30.30.2 PFXL=30 ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" - cleanup() { ip netns pids $ns0 | xargs kill 2>/dev/null ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns del $ns0 $ns1 + cleanup_all_ns } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi -ip netns add "$ns1" +checktool "nft --version" "run test without nft" +checktool "conntrack --version" "run test without conntrack" +checktool "socat -h" "run test without socat" trap cleanup EXIT -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 +setup_ns ns0 ns1 + +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: Could not add veth device" exit $ksft_skip fi -ip -net $ns0 li add tvrf type vrf table 9876 -if [ $? -ne 0 ];then +if ! 
ip -net "$ns0" li add tvrf type vrf table 9876; then echo "SKIP: Could not add vrf device" exit $ksft_skip fi -ip -net $ns0 li set lo up +ip -net "$ns0" li set veth0 master tvrf +ip -net "$ns0" li set tvrf up +ip -net "$ns0" li set veth0 up +ip -net "$ns1" li set veth0 up -ip -net $ns0 li set veth0 master tvrf -ip -net $ns0 li set tvrf up -ip -net $ns0 li set veth0 up -ip -net $ns1 li set veth0 up +ip -net "$ns0" addr add $IP0/$PFXL dev veth0 +ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip -net $ns0 addr add $IP0/$PFXL dev veth0 -ip -net $ns1 addr add $IP1/$PFXL dev veth0 +listener_ready() +{ + local ns="$1" -ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& -if [ $? -ne 0 ];then - echo "SKIP: Could not start iperf3" - exit $ksft_skip -fi + ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555" +} + +ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null & +busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" # test vrf ingress handling. # The incoming connection should be placed in conntrack zone 1, # as decided by the first iteration of the ruleset. 
test_ct_zone_in() { -ip netns exec $ns0 nft -f - <<EOF +ip netns exec "$ns0" nft -f - <<EOF table testct { chain rawpre { type filter hook prerouting priority raw; @@ -126,21 +109,21 @@ table testct { } } EOF - ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null + ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null # should be in zone 1, not zone 2 - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "PASS: entry found in conntrack zone 1" else echo "FAIL: entry not found in conntrack zone 1" - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "FAIL: entry found in zone 2 instead" else echo "FAIL: entry not in zone 1 or 2, dumping table" - ip netns exec $ns0 conntrack -L - ip netns exec $ns0 nft list ruleset + ip netns exec "$ns0" conntrack -L + ip netns exec "$ns0" nft list ruleset fi fi } @@ -153,12 +136,12 @@ test_masquerade_vrf() local qdisc=$1 if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc add dev tvrf root $qdisc + tc -net "$ns0" qdisc add dev tvrf root "$qdisc" fi - ip netns exec $ns0 conntrack -F 2>/dev/null + ip netns exec "$ns0" conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - <<EOF +ip netns exec "$ns0" nft -f - <<EOF flush ruleset table ip nat { chain rawout { @@ -179,25 +162,23 @@ table ip nat { } } EOF - ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" + if ! 
ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on vrf device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && - ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' && + ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then + echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" else echo "FAIL: vrf rules have unexpected counter value" ret=1 fi if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc del dev tvrf root + tc -net "$ns0" qdisc del dev tvrf root fi } @@ -206,8 +187,8 @@ EOF # oifname is the lower device (veth0 in this case). test_masquerade_veth() { - ip netns exec $ns0 conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - <<EOF + ip netns exec "$ns0" conntrack -F 2>/dev/null +ip netns exec "$ns0" nft -f - <<EOF flush ruleset table ip nat { chain postrouting { @@ -216,17 +197,15 @@ table ip nat { } } EOF - ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" + if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on veth device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' - if [ $? 
-eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then + echo "PASS: connect with masquerade + sport rewrite on veth device" else echo "FAIL: vrf masq rule has unexpected counter value" ret=1 diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh new file mode 100755 index 000000000000..4ceee9fb3949 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# End-to-end ipvs test suite +# Topology: +#--------------------------------------------------------------+ +# | | +# ns0 | ns1 | +# ----------- | ----------- ----------- | +# | veth01 | --------- | veth10 | | veth12 | | +# ----------- peer ----------- ----------- | +# | | | | +# ----------- | | | +# | br0 | |----------------- peer |--------------| +# ----------- | | | +# | | | | +# ---------- peer ---------- ----------- | +# | veth02 | --------- | veth20 | | veth21 | | +# ---------- | ---------- ----------- | +# | ns2 | +# | | +#--------------------------------------------------------------+ +# +# We assume that all network driver are loaded +# + +source lib.sh + +ret=0 +GREEN='\033[0;92m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +readonly port=8080 + +readonly vip_v4=207.175.44.110 +readonly cip_v4=10.0.0.2 +readonly gip_v4=10.0.0.1 +readonly dip_v4=172.16.0.1 +readonly rip_v4=172.16.0.2 +readonly sip_v4=10.0.0.3 + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 + +sysipvsnet="/proc/sys/net/ipv4/vs/" +if [ ! -d $sysipvsnet ]; then + if ! 
modprobe -q ip_vs; then + echo "skip: could not run test without ipvs module" + exit $ksft_skip + fi +fi + +checktool "ipvsadm -v" "run test without ipvsadm" +checktool "socat -h" "run test without socat" + +setup() { + setup_ns ns0 ns1 ns2 + + ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}" + ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}" + ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}" + + ip netns exec "${ns0}" ip link set veth01 up + ip netns exec "${ns0}" ip link set veth02 up + ip netns exec "${ns0}" ip link add br0 type bridge + ip netns exec "${ns0}" ip link set veth01 master br0 + ip netns exec "${ns0}" ip link set veth02 master br0 + ip netns exec "${ns0}" ip link set br0 up + ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0 + + ip netns exec "${ns1}" ip link set veth10 up + ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10 + ip netns exec "${ns1}" ip link set veth12 up + ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12 + + ip netns exec "${ns2}" ip link set veth21 up + ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21 + ip netns exec "${ns2}" ip link set veth20 up + ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20 + + sleep 1 + + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none +} + +cleanup() { + cleanup_all_ns + + if [ -f "${outfile}" ]; then + rm "${outfile}" + fi + if [ -f "${infile}" ]; then + rm "${infile}" + fi +} + +server_listen() { + ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & + server_pid=$! 
+ sleep 0.2 +} + +client_connect() { + ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}" +} + +verify_data() { + wait "${server_pid}" + cmp "$infile" "$outfile" 2>/dev/null +} + +test_service() { + server_listen + client_connect + verify_data +} + + +test_dr() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 + + # avoid incorrect arp response + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 + # avoid reverse route lookup + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 + + test_service +} + +test_nat() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 + + ip netns exec "${ns2}" ip link del veth20 + ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21 + + test_service +} + +test_tun() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" modprobe -q ipip + ip netns exec "${ns1}" ip link set tunl0 up + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0 + ip netns exec "${ns1}" ipvsadm -A -t 
"${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r ${rip_v4}:${port} + ip netns exec "${ns1}" ip addr add ${vip_v4}/32 dev lo:1 + + ip netns exec "${ns2}" modprobe -q ipip + ip netns exec "${ns2}" ip link set tunl0 up + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 + + test_service +} + +run_tests() { + local errors= + + echo "Testing DR mode..." + cleanup + setup + test_dr + errors=$(( $errors + $? )) + + echo "Testing NAT mode..." + cleanup + setup + test_nat + errors=$(( $errors + $? )) + + echo "Testing Tunnel mode..." + cleanup + setup + test_tun + errors=$(( $errors + $? )) + + return $errors +} + +trap cleanup EXIT + +run_tests + +if [ $? -ne 0 ]; then + echo -e "$(basename $0): ${RED}FAIL${NC}" + exit 1 +fi +echo -e "$(basename $0): ${GREEN}PASS${NC}" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh new file mode 100644 index 000000000000..bedd35298e15 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/lib.sh @@ -0,0 +1,10 @@ +net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "$net_netfilter_dir/../lib.sh" + +checktool (){ + if ! 
$1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..1014551dd769 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +source lib.sh +ret=0 +socatpid=0 + +cleanup() +{ + [ "$socatpid" -gt 0 ] && kill "$socatpid" + + cleanup_all_ns +} + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "run test without iptables" + +trap cleanup EXIT + +setup_ns ns1 ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns "$ns1" dev veth1 +ip link set netns "$ns2" dev veth2 + +ip netns exec "$ns1" ip link set up dev lo +ip netns exec "$ns1" ip link set up dev veth1 +ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec "$ns2" ip link set up dev lo +ip netns exec "$ns2" ip link set up dev veth2 +ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & +socatpid=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... 
and route it to the other namespace +ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1 + +# add a persistent connection from the other namespace +ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null +ret=$? + +# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). +if [ $ret -eq 0 ]; then + echo "PASS: socat can connect via NAT'd address" +else + echo "FAIL: socat cannot connect via NAT'd address" +fi + +# check sport clashres. +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 + +sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & + +# if connect succeeds, client closes instantly due to EOF on stdin. +# if connect hangs, it will time out after 5s. +echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & +cpid2=$! + +time_then=$(date +%s) +wait $cpid2 +rv=$? +time_now=$(date +%s) + +# Check how much time has elapsed, expectation is for +# 'cpid2' to connect and then exit (and no connect delay). 
+delta=$((time_now - time_then)) + +if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then + echo "PASS: could connect to service via redirected ports" +else + echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c index 9e56b9d47037..9e56b9d47037 100644 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ b/tools/testing/selftests/net/netfilter/nf_queue.c diff --git a/tools/testing/selftests/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh index 99ed5bd6e840..902f8114bc80 100755 --- a/tools/testing/selftests/netfilter/nft_audit.sh +++ b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -6,11 +6,34 @@ SKIP_RC=4 RC=0 +if [ -r /var/run/auditd.pid ];then + read pid < /var/run/auditd.pid + p=$(pgrep ^auditd$) + + if [ "$pid" -eq "$p" ]; then + echo "SKIP: auditd is running" + exit $SKIP_RC + fi +fi + nft --version >/dev/null 2>&1 || { echo "SKIP: missing nft tool" exit $SKIP_RC } +# nft must be recent enough to support "reset" keyword. +nft --check -f /dev/stdin >/dev/null 2>&1 <<EOF +add table t +add chain t c +reset rules t c +EOF + +if [ "$?" 
-ne 0 ];then + echo -n "SKIP: nft reset feature test failed: " + nft --version + exit $SKIP_RC +fi + # Run everything in a separate network namespace [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } @@ -73,7 +96,7 @@ done for ((i = 0; i < 500; i++)); do echo "add rule t2 c3 counter accept comment \"rule $i\"" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=500 op=nft_register_rule' @@ -101,7 +124,7 @@ do_test 'nft add counter t2 c1; add counter t2 c2' \ for ((i = 3; i <= 500; i++)); do echo "add counter t2 c$i" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -115,7 +138,7 @@ do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ for ((i = 3; i <= 500; i++)); do echo "add quota t2 q$i { 10 bytes }" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -157,7 +180,7 @@ table=t2 family=2 entries=135 op=nft_reset_rule' # resetting sets and elements -elem=(22 ,80 ,443) +elem=(22 ",80" ",443") relem="" for i in {1..3}; do relem+="${elem[((i - 1))]}" diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index e908009576c7..2b6661519055 100755 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # nft_concat_range.sh - Tests for sets with concatenation of ranged fields @@ -7,10 +7,10 @@ # # Author: Stefano Brivio <sbrivio@redhat.com> # -# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317 # ^ Configuration and templates sourced with eval, counters reused in subshells -KSELFTEST_SKIP=4 +source lib.sh # Available test groups: # - reported_issues: check for issues that were reported 
in the past @@ -66,7 +66,7 @@ src start 1 count 5 src_delta 2000 -tools sendip nc bash +tools sendip bash proto udp race_repeat 3 @@ -91,7 +91,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -116,7 +116,7 @@ src start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 3 @@ -141,7 +141,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -163,7 +163,7 @@ src mac start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -185,7 +185,7 @@ src mac proto start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -207,7 +207,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -227,7 +227,7 @@ src addr6 port start 10 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp6 race_repeat 3 @@ -247,7 +247,7 @@ src mac proto addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -264,7 +264,7 @@ src mac start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -286,7 +286,7 @@ src mac addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -337,7 +337,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp race_repeat 3 @@ -363,7 +363,7 @@ src mac start 1 count 1 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -473,8 +473,6 @@ setup_veth() { B() { ip netns exec B "$@" >/dev/null 2>&1 } - - sleep 2 } # Fill in set template and initialise set @@ -488,12 +486,6 @@ check_tools() { __tools= for tool in ${tools}; do - if [ "${tool}" = 
"nc" ] && [ "${proto}" = "udp6" ] && \ - ! nc -u -w0 1.1.1.1 1 2>/dev/null; then - # Some GNU netcat builds might not support IPv6 - __tools="${__tools} netcat-openbsd" - continue - fi __tools="${__tools} ${tool}" command -v "${tool}" >/dev/null && return 0 @@ -554,30 +546,7 @@ setup_send_udp() { ip addr add "${dst_addr4}" dev veth_a 2>/dev/null [ -z "${dst_port}" ] && dst_port=12345 - echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" - - src_addr4= - src_port= - } - elif command -v nc >/dev/null; then - if nc -u -w0 1.1.1.1 1 2>/dev/null; then - # OpenBSD netcat - nc_opt="-w0" - else - # GNU netcat - nc_opt="-q0" - fi - - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __src_addr4="-s ${src_addr4}" - fi - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ - "${src_port}" "${dst_addr4}" "${dst_port}" + echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}" src_addr4= src_port= @@ -632,11 +601,7 @@ setup_send_udp6() { __socatbind6= if [ -n "${src_addr6}" ]; then - if [ -n "${src_addr6} != "${src_addr6_added} ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - - src_addr6_added=${src_addr6} - fi + B ip addr add "${src_addr6}" dev veth_b nodad __socatbind6=",bind=[${src_addr6}]" @@ -645,26 +610,7 @@ setup_send_udp6() { fi fi - echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" - } - elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then - # GNU netcat might not work with IPv6, try next tool - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - # shellcheck disable=SC2086 # this needs 
split options - echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ - ${dst_addr6} ${dst_port} - - src_addr6= - src_port= + echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}" } elif [ -z "$(bash -c 'type -p')" ]; then send_udp6() { @@ -679,10 +625,17 @@ setup_send_udp6() { fi } +listener_ready() +{ + port="$1" + ss -lnt -o "sport = :$port" | grep -q "$port" +} + # Set up function to send TCP traffic on IPv4 setup_flood_tcp() { if command -v iperf3 >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -699,7 +652,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ @@ -711,6 +664,7 @@ setup_flood_tcp() { } elif command -v iperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -727,7 +681,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ @@ -739,6 +693,7 @@ setup_flood_tcp() { } elif command -v netperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -755,7 +710,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + 
busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -774,6 +729,7 @@ setup_flood_tcp() { setup_flood_tcp6() { if command -v iperf3 >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -790,7 +746,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr6}" ${dst_port} \ @@ -802,6 +758,7 @@ setup_flood_tcp6() { } elif command -v iperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -818,7 +775,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr6}" -V ${dst_port} \ @@ -830,6 +787,7 @@ setup_flood_tcp6() { } elif command -v netperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -846,7 +804,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options netserver -6 ${dst_port} -L "${dst_addr6}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -6 -H "${dst_addr6}" ${dst_port} \ @@ -865,6 +823,7 @@ setup_flood_tcp6() { setup_flood_udp() { if command -v iperf3 >/dev/null; then 
flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -881,7 +840,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ @@ -893,6 +852,7 @@ setup_flood_udp() { } elif command -v iperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -909,7 +869,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ @@ -921,6 +881,7 @@ setup_flood_udp() { } elif command -v netperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -937,7 +898,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -982,6 +943,7 @@ cleanup() { ip link del dummy0 2>/dev/null ip route del default 2>/dev/null ip -6 route del default 2>/dev/null + ip netns pids B 2>/dev/null | xargs kill 2>/dev/null ip netns del B 2>/dev/null ip link del veth_a 2>/dev/null timeout= @@ -989,15 +951,14 @@ cleanup() { killall iperf 2>/dev/null killall netperf 2>/dev/null killall netserver 2>/dev/null - rm 
-f ${tmp} - sleep 2 + rm -f "$tmp" } # Entry point for setup functions setup() { if [ "$(id -u)" -ne 0 ]; then echo " need to run as root" - exit ${KSELFTEST_SKIP} + exit ${ksft_skip} fi cleanup @@ -1258,7 +1219,7 @@ send_nomatch() { # - check that packets outside range don't match it # - remove some elements, check that packets don't match anymore test_correctness() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} range_size=1 for i in $(seq "${start}" $((start + count))); do @@ -1273,7 +1234,7 @@ test_correctness() { srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 @@ -1281,7 +1242,7 @@ test_correctness() { # Delete elements now and then if [ $((i % 3)) -eq 0 ]; then del "$(format)" || return 1 - for j in $(seq ${start} \ + for j in $(seq "$start" \ $((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) \ || return 1 @@ -1307,12 +1268,12 @@ test_concurrency() { proto=${flood_proto} tools=${flood_tools} chain_spec=${flood_spec} - setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth flood_"${proto}" set || return ${ksft_skip} range_size=1 cstart=${start} flood_pids= - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1325,7 +1286,7 @@ test_concurrency() { start=$((end + range_size)) done - sleep 10 + sleep $((RANDOM%10)) pids= for c in $(seq 1 "$(nproc)"); do ( @@ -1335,7 +1296,7 @@ test_concurrency() { # $start needs to be local to this subshell # shellcheck disable=SC2030 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + 
count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1350,7 +1311,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1366,7 +1327,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1379,7 +1340,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1407,18 +1368,18 @@ test_concurrency() { # - add all the elements with 3s timeout while checking that packets match # - wait 3s after the last insertion, check that packets don't match any entry test_timeout() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} timeout=3 range_size=1 - for i in $(seq "${start}" $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done @@ -1426,12 +1387,12 @@ test_timeout() { start=$((end + range_size)) done sleep 3 - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" 
$((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) || return 1 done @@ -1450,13 +1411,13 @@ test_performance() { chain_spec=${perf_spec} dst="${perf_dst}" src="${perf_src}" - setup veth perf set || return ${KSELFTEST_SKIP} + setup veth perf set || return ${ksft_skip} first=${start} range_size=1 for set in test norange noconcat; do start=${first} - for i in $(seq ${start} $((start + perf_entries))); do + for i in $(seq "$start" $((start + perf_entries))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1464,7 +1425,7 @@ test_performance() { if [ $((end / 65534)) -gt $((start / 65534)) ]; then start=${end} end=$((end + 1)) - elif [ ${start} -eq ${end} ]; then + elif [ "$start" -eq "$end" ]; then end=$((start + 1)) fi @@ -1475,7 +1436,7 @@ test_performance() { nft -f "${tmp}" done - perf $((end - 1)) ${srcstart} + perf $((end - 1)) "$srcstart" sleep 2 @@ -1522,11 +1483,11 @@ test_bug_flush_remove_add() { set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 
22-25 }' - for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} - nft add element t s ${elem1} 2>/dev/null || return 1 + for i in $(seq 1 100); do + nft add table t "$set_cmd" || return ${ksft_skip} + nft add element t s "$elem1" 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 - nft add element t s ${elem2} 2>/dev/null || return 1 + nft add element t s "$elem2" 2>/dev/null || return 1 done nft flush ruleset } @@ -1534,7 +1495,7 @@ test_bug_flush_remove_add() { # - add ranged element, check that packets match it # - reload the set, check packets still match test_bug_reload() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} rstart=${start} range_size=1 @@ -1573,7 +1534,7 @@ test_bug_reload() { srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done @@ -1596,7 +1557,7 @@ trap cleanup EXIT # Entry point for test runs passed=0 for name in ${TESTS}; do - printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" + printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')" if [ "${name}" = "reported_issues" ]; then SUBTESTS="${BUGS}" else @@ -1635,11 +1596,11 @@ for name in ${TESTS}; do printf "[FAIL]\n" err_flush exit 1 - elif [ $ret -eq ${KSELFTEST_SKIP} ]; then + elif [ $ret -eq ${ksft_skip} ]; then printf "[SKIP]\n" err_flush fi done done -[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 +[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh new file mode 100755 index 000000000000..abcaa7337197 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# This tests connection tracking helper 
assignment: +# 1. can attach ftp helper to a connection from nft ruleset. +# 2. auto-assign still works. +# +# Kselftest framework requirement - SKIP code is 4. + +source lib.sh + +ret=0 + +testipv6=1 + +checktool "socat -h" "run test without socat" +checktool "conntrack --version" "run test without conntrack" +checktool "nft --version" "run test without nft" + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + + ip netns del "$ns1" + ip netns del "$ns2" +} + +trap cleanup EXIT + +setup_ns ns1 ns2 + +if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net "$ns1" link set veth0 up +ip -net "$ns2" link set veth0 up + +ip -net "$ns1" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$ns2" addr add 10.0.1.2/24 dev veth0 +ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad + +load_ruleset_family() { + local family=$1 + local ns=$2 + +ip netns exec "$ns" nft -f - <<EOF +table $family raw { + ct helper ftp { + type "ftp" protocol tcp + } + chain pre { + type filter hook prerouting priority 0; policy accept; + tcp dport 2121 ct helper set "ftp" + } + chain output { + type filter hook output priority 0; policy accept; + tcp dport 2121 ct helper set "ftp" + } +} +EOF + return $? +} + +check_for_helper() +{ + local netns=$1 + local message=$2 + local port=$3 + + if echo "$message" |grep -q 'ipv6';then + local family="ipv6" + else + local family="ipv4" + fi + + if ! 
ip netns exec "$netns" conntrack -L -f $family -p tcp --dport "$port" 2> /dev/null |grep -q 'helper=ftp';then + if [ "$autoassign" -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ "$autoassign" -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi + fi + + return 0 +} + +listener_ready() +{ + ns="$1" + port="$2" + proto="$3" + ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port" +} + +test_helper() +{ + local port=$1 + local autoassign=$2 + + if [ "$autoassign" -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi + + ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4" + + ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ip $msg" "$port" "$autoassign" + check_for_helper "$ns2" "ip $msg" "$port" "$autoassign" + + if [ $testipv6 -eq 0 ] ;then + return 0 + fi + + ip netns exec "$ns1" conntrack -F 2> /dev/null + ip netns exec "$ns2" conntrack -F 2> /dev/null + + ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6" + + ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ipv6 $msg" "$port" + check_for_helper "$ns2" "ipv6 $msg" "$port" +} + +if ! load_ruleset_family ip "$ns1"; then + echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 + exit 1 +fi + +if ! load_ruleset_family ip6 "$ns1"; then + echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 + testipv6=0 +fi + +if ! load_ruleset_family inet "${ns2}"; then + echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 + if ! 
load_ruleset_family ip "${ns2}"; then + echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 + exit 1 + fi + + if [ "$testipv6" -eq 1 ] ;then + if ! load_ruleset_family ip6 "$ns2"; then + echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 + exit 1 + fi + fi +fi + +test_helper 2121 0 +ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh new file mode 100755 index 000000000000..ce1451c275fd --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. + +source lib.sh + +ret=0 + +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + cleanup_all_ns + + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +checktool "nft --version" "run test without nft" + +setup_ns nsrouter ns1 ns2 + +trap cleanup EXIT + +if dmesg | grep -q ' nft_rpfilter: ';then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 + +load_ruleset() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority 0; policy accept; + fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop + } +} +EOF +} + +load_pbr_ruleset() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain forward { + type filter hook forward priority raw; + fib saddr . 
iif oif gt 0 accept + log drop + } +} +EOF +} + +load_ruleset_count() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority 0; policy accept; + ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop + ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop + } +} +EOF +} + +check_drops() { + if dmesg | grep -q ' nft_rpfilter: ';then + dmesg | grep ' nft_rpfilter: ' + echo "FAIL: rpfilter did drop packets" + return 1 + fi + + return 0 +} + +check_fib_counter() { + local want=$1 + local ns=$2 + local address=$3 + + if ! ip netns exec "$ns" nft list table inet filter | grep 'fib saddr . iif' | grep "$address" | grep -q "packets $want";then + echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2 + ip netns exec "$ns" nft list table inet filter + return 1 + fi + + if [ "$want" -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset "$nsrouter" +load_ruleset "$ns1" +load_ruleset "$ns2" + +if ! 
ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + if ! 
ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec "$nsrouter" nft flush table inet filter + +ip -net "$ns1" route del default +ip -net "$ns1" -6 route del default + +ip -net "$ns1" addr del 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr del dead:1::99/64 dev eth0 + +ip -net "$ns1" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad + +ip -net "$ns1" route add default via 10.0.2.1 +ip -net "$ns1" -6 route add default via dead:2::1 + +ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count "$nsrouter" + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. 
+check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1 +check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1 + +# delete all rules +ip netns exec "$ns1" nft flush ruleset +ip netns exec "$ns2" nft flush ruleset +ip netns exec "$nsrouter" nft flush ruleset + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad + +ip -net "$ns1" addr del 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr del dead:2::99/64 dev eth0 + +ip -net "$nsrouter" addr del dead:2::1/64 dev veth0 + +# ... pbr ruleset for the router, check iif+oif. +if ! load_pbr_ruleset "$nsrouter";then + echo "SKIP: Could not load fib forward ruleset" + exit $ksft_skip +fi + +ip -net "$nsrouter" rule add from all table 128 +ip -net "$nsrouter" rule add from all iif veth0 table 129 +ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1 + +# drop main ipv4 table +ip -net "$nsrouter" -4 rule delete table main + +if ! test_ping 10.0.2.99 dead:2::99;then + ip -net "$nsrouter" nft list ruleset + echo "FAIL: fib mismatch in pbr setup" + exit 1 +fi + +echo "PASS: fib expression forward check with policy based routing" +exit 0 diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a32f490f7539..86d516e8acd6 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -14,14 +14,8 @@ # nft_flowtable.sh -o8000 -l1500 -r2000 # -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsr1="nsr1-$sfx" -nsr2="nsr2-$sfx" - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 +source lib.sh + ret=0 nsin="" @@ -30,52 +24,41 @@ ns2out="" log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add $nsr1" "create net namespace $nsr1" +checktool "socat -h" "run test without socat" -ip netns add $ns1 -ip netns add $ns2 -ip netns add $nsr2 +setup_ns ns1 ns2 nsr1 nsr2 cleanup() { - ip netns del $ns1 - ip netns del $ns2 - ip netns del $nsr1 - ip netns del $nsr2 + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns rm -f "$nsin" "$ns1out" "$ns2out" - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" } trap cleanup EXIT sysctl -q net.netfilter.nf_log_all_netns=1 -ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 +ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2" -ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 +ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2" -for dev in lo veth0 veth1; do - ip -net $nsr1 link set $dev up - ip -net $nsr2 link set $dev up +for dev in veth0 veth1; do + ip -net "$nsr1" link set "$dev" up + ip -net "$nsr2" link set "$dev" up done -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad -ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net $nsr2 addr add dead:2::1/64 dev veth1 +ip -net "$nsr2" addr add 10.0.2.1/24 
dev veth1 +ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad # set different MTUs so we need to push packets coming from ns1 (large MTU) # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), @@ -107,56 +90,63 @@ do esac done -if ! ip -net $nsr1 link set veth0 mtu $omtu; then +if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then + exit 1 +fi + +ip -net "$ns1" link set eth0 mtu "$omtu" + +if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then exit 1 fi -ip -net $ns1 link set eth0 mtu $omtu +if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then + exit 1 +fi -if ! ip -net $nsr2 link set veth1 mtu $rmtu; then +if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then exit 1 fi -ip -net $ns2 link set eth0 mtu $rmtu +ip -net "$ns2" link set eth0 mtu "$rmtu" # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. -ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net $nsr1 addr add fee1:2::1/64 dev veth1 +ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1 +ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad -ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net $nsr2 addr add fee1:2::2/64 dev veth0 +ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 +ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad for i in 0 1; do - ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null - ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null done -for ns in $ns1 $ns2;do - ip -net $ns link set lo up - ip -net $ns link set eth0 up +for ns in "$ns1" "$ns2";do + ip -net "$ns" link set eth0 up - if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + if ! 
ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then echo "ERROR: Check Originator/Responder values (problem during address addition)" exit 1 fi # don't set ip DF bit for first two tests - ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null + ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns2 addr add dead:2::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$ns2" route add default via dead:2::1 -ip -net $nsr1 route add default via 192.168.10.2 -ip -net $nsr2 route add default via 192.168.10.1 +ip -net "$nsr1" route add default via 192.168.10.2 +ip -net "$nsr2" route add default via 192.168.10.1 -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table inet filter { flowtable f1 { hook ingress priority 0 @@ -188,7 +178,7 @@ if [ $? -ne 0 ]; then exit $ksft_skip fi -ip netns exec $ns2 nft -f - <<EOF +ip netns exec "$ns2" nft -f - <<EOF table inet filter { counter ip4dscp0 { } counter ip4dscp3 { } @@ -204,25 +194,22 @@ table inet filter { EOF if [ $? -ne 0 ]; then - echo "SKIP: Could not load nft ruleset" + echo -n "SKIP: Could not load ruleset: " + nft --version exit $ksft_skip fi # test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then +if ! 
ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: $ns1 cannot reach ns2" 1>&2 exit 1 fi -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then echo "ERROR: $ns2 cannot reach $ns1" 1>&2 exit 1 fi -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" -fi - nsin=$(mktemp) ns1out=$(mktemp) ns2out=$(mktemp) @@ -248,23 +235,27 @@ check_counters() local what=$1 local ok=1 - local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) - local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) + local orig repl + orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets) + repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets) local orig_cnt=${orig#*bytes} local repl_cnt=${repl#*bytes} - local fs=$(du -sb $nsin) + local fs + fs=$(du -sb "$nsin") local max_orig=${fs%%/*} local max_repl=$((max_orig/4)) - if [ $orig_cnt -gt $max_orig ];then + # flowtable fastpath should bypass normal routing one, i.e. the counters in forward hook + # should always be lower than the size of the transmitted file (max_orig). 
+ if [ "$orig_cnt" -gt "$max_orig" ];then echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 ret=1 ok=0 fi - if [ $repl_cnt -gt $max_repl ];then + if [ "$repl_cnt" -gt $max_repl ];then echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 ret=1 ok=0 @@ -280,39 +271,40 @@ check_dscp() local what=$1 local ok=1 - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) + local counter + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets) local pc4=${counter%*bytes*} local pc4=${pc4#*packets} - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets) local pc4z=${counter%*bytes*} local pc4z=${pc4z#*packets} case "$what" in "dscp_none") - if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_fwd") - if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_ingress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_egress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 @@ -324,7 +316,7 @@ check_dscp() ok=0 esac - if [ $ok -eq 1 ] ;then + if [ "$ok" -eq 1 ] ;then echo "PASS: $what: dscp packet counters match" fi } @@ -345,6 +337,11 @@ check_transfer() return 0 } +listener_ready() +{ + ss -N "$nsb" -lnt -o 
"sport = :12345" | grep -q 12345 +} + test_tcp_forwarding_ip() { local nsa=$1 @@ -353,40 +350,23 @@ test_tcp_forwarding_ip() local dstport=$4 local lret=0 - ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & + timeout 10 ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & lpid=$! - sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & - cpid=$! - - sleep 1 - - prev="$(ls -l $ns1out $ns2out)" - sleep 1 - - while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do - sleep 1; - prev="$(ls -l $ns1out $ns2out)" - done + busywait 1000 listener_ready - if test -d /proc/"$lpid"/; then - kill $lpid - fi - - if test -d /proc/"$cpid"/; then - kill $cpid - fi + timeout 10 ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" wait $lpid - wait $cpid if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 + ret=1 fi if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then lret=1 + ret=1 fi return $lret @@ -403,7 +383,7 @@ test_tcp_forwarding_set_dscp() { check_dscp "dscp_none" -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { chain setdscp0 { type filter hook ingress device "veth0" priority 0; policy accept @@ -415,12 +395,12 @@ if [ $? -eq 0 ]; then test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_ingress" - ip netns exec $nsr1 nft delete table netdev dscpmangle + ip netns exec "$nsr1" nft delete table netdev dscpmangle else echo "SKIP: Could not load netdev:ingress for veth0" fi -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { chain setdscp0 { type filter hook egress device "veth1" priority 0; policy accept @@ -432,14 +412,14 @@ if [ $? 
-eq 0 ]; then test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_egress" - ip netns exec $nsr1 nft flush table netdev dscpmangle + ip netns exec "$nsr1" nft flush table netdev dscpmangle else echo "SKIP: Could not load netdev:egress for veth1" fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). - ip netns exec $nsr1 nft -a insert rule inet filter forward ip dscp set cs3 + ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_fwd" } @@ -455,8 +435,8 @@ test_tcp_forwarding_nat() pmtu=$3 what=$4 - if [ $lret -eq 0 ] ; then - if [ $pmtu -eq 1 ] ;then + if [ "$lret" -eq 0 ] ; then + if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what" else echo "PASS: flow offload for ns1/ns2 with masquerade $what" @@ -464,9 +444,9 @@ test_tcp_forwarding_nat() test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 lret=$? - if [ $pmtu -eq 1 ] ;then + if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what" - elif [ $lret -eq 0 ] ; then + elif [ "$lret" -eq 0 ] ; then echo "PASS: flow offload for ns1/ns2 with dnat $what" fi fi @@ -481,25 +461,25 @@ make_file "$nsin" # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding $ns1 $ns2; then +if test_tcp_forwarding "$ns1" "$ns2"; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. 
-ip -net $ns2 route del default via 10.0.2.1 -ip -net $ns2 route del default via dead:2::1 -ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route del default via 10.0.2.1 +ip -net "$ns2" route del default via dead:2::1 +ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1 # Second test: # Same, but with NAT enabled. Same as in first test: we expect normal forward path # to handle most packets. -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table ip nat { chain prerouting { type nat hook prerouting priority 0; policy accept; @@ -513,14 +493,14 @@ table ip nat { } EOF -if ! test_tcp_forwarding_set_dscp $ns1 $ns2 0 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2 exit 0 fi -if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi @@ -528,35 +508,40 @@ fi # Same as second test, but with PMTU discovery enabled. This # means that we expect the fastpath to handle packets as soon # as the endpoints adjust the packet size. -ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # reset counters. # With pmtu in-place we'll also check that nft counters # are lower than file size and packets were forwarded via flowtable layer. # For earlier tests (large mtus), packets cannot be handled via flowtable # (except pure acks and other small packets). -ip netns exec $nsr1 nft reset counters table inet filter >/dev/null +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then +if ! 
test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset fi # Another test: # Add bridge interface br0 to Router1, with NAT enabled. -ip -net $nsr1 link add name br0 type bridge -ip -net $nsr1 addr flush dev veth0 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set veth0 master br0 -ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 -ip -net $nsr1 link set up dev br0 +test_bridge() { +if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then + echo "SKIP: could not add bridge br0" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" addr flush dev veth0 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set veth0 master br0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev br0 +ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad +ip -net "$nsr1" link set up dev br0 -ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null +ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null # br0 with NAT enabled. -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF flush table ip nat table ip nat { chain prerouting { @@ -571,56 +556,59 @@ table ip nat { } EOF -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # Another test: # Add bridge interface br0 to Router1, with NAT and VLAN. 
-ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set down dev veth0 -ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set up dev veth0.10 -ip -net $nsr1 link set veth0.10 master br0 - -ip -net $ns1 addr flush dev eth0 -ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net $ns1 link set eth0 up -ip -net $ns1 link set eth0.10 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set down dev veth0 +ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set up dev veth0.10 +ip -net "$nsr1" link set veth0.10 master br0 + +ip -net "$ns1" addr flush dev eth0 +ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" link set eth0.10 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad + +if ! 
test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # restore test topology (remove bridge and VLAN) -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set veth0 down -ip -net $nsr1 link set veth0.10 down -ip -net $nsr1 link delete veth0.10 type vlan -ip -net $nsr1 link delete br0 type bridge -ip -net $ns1 addr flush dev eth0.10 -ip -net $ns1 link set eth0.10 down -ip -net $ns1 link set eth0 down -ip -net $ns1 link delete eth0.10 type vlan +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set veth0 down +ip -net "$nsr1" link set veth0.10 down +ip -net "$nsr1" link delete veth0.10 type vlan +ip -net "$nsr1" link delete br0 type bridge +ip -net "$ns1" addr flush dev eth0.10 +ip -net "$ns1" link set eth0.10 down +ip -net "$ns1" link set eth0 down +ip -net "$ns1" link delete eth0.10 type vlan # restore address in ns1 and nsr1 -ip -net $ns1 link set eth0 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 -ip -net $nsr1 link set up dev veth0 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsr1" link set up dev veth0 +} + +test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) @@ -640,33 +628,43 @@ do_esp() { local spi_out=$6 local spi_in=$7 - ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes 
$KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet - ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet" + ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet" # to encrypt packets as they go out (includes forwarded packets that need encapsulation) - ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow + ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow # to fwd decrypted packets after esp processing: - ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow - + ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow } -do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2" -do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1" -ip netns exec $nsr1 nft delete table ip nat +ip netns exec "$nsr1" nft delete table ip nat # restore default routes -ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding $ns1 $ns2; then +if test_tcp_forwarding "$ns1" 
"$ns2"; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec $nsr1 nft list ruleset 1>&2 - ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 +fi + +if [ "$1" = "" ]; then + low=1280 + mtu=$((65536 - low)) + o=$(((RANDOM%mtu) + low)) + l=$(((RANDOM%mtu) + low)) + r=$(((RANDOM%mtu) + low)) + + echo "re-run with random mtus: -o $o -l $l -r $r" + $0 -o "$o" -l "$l" -r "$r" fi exit $ret diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh index f33154c04d34..71505b6cb252 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/net/netfilter/nft_meta.sh @@ -91,10 +91,10 @@ check_one_counter() local want="packets $2" local verbose="$3" - if ! ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then + if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $cname + ip netns exec "$ns0" nft list counter inet filter "$cname" fi } diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh index dd40d9f6f259..9e39de26455f 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat.sh @@ -3,77 +3,60 @@ # This test is for basic NAT functionality: snat, dnat, redirect, masquerade. # -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 +source lib.sh + ret=0 test_inet_nat=true -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" +checktool "nft --version" "run test without nft tool" +checktool "socat -h" "run test without socat" cleanup() { - for i in 0 1 2; do ip netns del ns$i-"$sfx";done -} + ip netns pids "$ns0" | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi + rm -f "$INFILE" "$OUTFILE" -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi + cleanup_all_ns +} trap cleanup EXIT -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi +INFILE=$(mktemp) +OUTFILE=$(mktemp) -ip netns add "$ns2" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns2" - exit $ksft_skip -fi +setup_ns ns0 ns1 ns2 -ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! 
ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" -ip -net "$ns0" link set lo up ip -net "$ns0" link set veth0 up ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 -ip -net "$ns0" addr add dead:1::1/64 dev veth0 +ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad ip -net "$ns0" link set veth1 up ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 -ip -net "$ns0" addr add dead:2::1/64 dev veth1 - -for i in 1 2; do - ip -net ns$i-$sfx link set lo up - ip -net ns$i-$sfx link set eth0 up - ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i-$sfx route add default via 10.0.$i.1 - ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 - ip -net ns$i-$sfx route add default via dead:$i::1 -done +ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad + +do_config() +{ + ns="$1" + subnet="$2" + + ip -net "$ns" link set eth0 up + ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0 + ip -net "$ns" route add default via "10.0.$subnet.1" + ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad + ip -net "$ns" route add default via "dead:$subnet::1" +} + +do_config "$ns1" 1 +do_config "$ns2" 2 bad_counter() { @@ -83,7 +66,7 @@ bad_counter() local tag=$4 echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 - ip netns exec $ns nft list counter inet filter $counter 1>&2 + ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2 } check_counters() @@ -91,26 +74,23 @@ check_counters() ns=$1 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" + if ! 
ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" + + if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in6 "$expect" "check_counters 3" + if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then + bad_counter "$ns" ns0in6 "$expect" "check_counters 3" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out6 "$expect" "check_counters 4" + if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then + bad_counter "$ns" ns0out6 "$expect" "check_counters 4" lret=1 fi @@ -122,41 +102,35 @@ check_ns0_counters() local ns=$1 local lret=0 - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in6 "packets 0 bytes 0" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! 
ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " lret=1 fi for dir in "in" "out" ; do expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5" lret=1 fi done @@ -166,8 +140,8 @@ check_ns0_counters() reset_counters() { - for i in 0 1 2;do - ip netns exec ns$i-$sfx nft reset counters inet > /dev/null + for i in "$ns0" "$ns1" "$ns2" ;do + ip netns exec "$i" nft reset counters inet > /dev/null done } @@ -177,7 +151,7 @@ test_local_dnat6() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip6" fi @@ -195,8 +169,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null - if [ $? -ne 0 ]; then + if ! 
ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null;then lret=1 echo "ERROR: ping6 failed" return $lret @@ -204,8 +177,7 @@ EOF expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" lret=1 fi @@ -213,8 +185,7 @@ EOF expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" lret=1 fi @@ -223,8 +194,7 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" lret=1 fi @@ -233,8 +203,7 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" lret=1 fi @@ -252,7 +221,7 @@ test_local_dnat() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip" fi @@ -265,7 +234,7 @@ table $family nat { } EOF if [ $? 
-ne 0 ]; then - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then echo "SKIP: inet nat tests" test_inet_nat=false return $ksft_skip @@ -275,8 +244,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -284,18 +252,16 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1" lret=1 fi done expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2" lret=1 fi done @@ -303,9 +269,8 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" + if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3" lret=1 fi done @@ -313,20 +278,18 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" + if ! 
ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4" lret=1 fi done test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" - ip netns exec "$ns0" nft flush chain $family nat output + ip netns exec "$ns0" nft flush chain "$family" nat output reset_counters - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -334,16 +297,14 @@ EOF expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" lret=1 fi done expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" lret=1 fi @@ -352,8 +313,7 @@ EOF # expect 1 count in ns1 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" lret=1 fi @@ -362,8 +322,7 @@ EOF # expect 0 packet in ns2 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! 
ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" lret=1 fi @@ -374,13 +333,19 @@ EOF return $lret } +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + test_local_dnat_portonly() { local family=$1 local daddr=$2 local lret=0 - local sr_s - local sr_r ip netns exec "$ns0" nft -f /dev/stdin <<EOF table $family nat { @@ -392,7 +357,7 @@ table $family nat { } EOF if [ $? -ne 0 ]; then - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then echo "SKIP: inet port test" test_inet_nat=false return @@ -401,17 +366,16 @@ EOF return fi - echo SERVER-$family | ip netns exec "$ns1" timeout 5 socat -u STDIN TCP-LISTEN:2000 & - sc_s=$! + echo "SERVER-$family" | ip netns exec "$ns1" timeout 3 socat -u STDIN TCP-LISTEN:2000 & - sleep 1 + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 2000 "-t" - result=$(ip netns exec "$ns0" timeout 1 socat TCP:$daddr:2000 STDOUT) + result=$(ip netns exec "$ns0" timeout 1 socat -u TCP:"$daddr":2000 STDOUT) if [ "$result" = "SERVER-inet" ];then echo "PASS: inet port rewrite without l3 address" else - echo "ERROR: inet port rewrite" + echo "ERROR: inet port rewrite without l3 address, got $result" ret=1 fi } @@ -424,24 +388,20 @@ test_masquerade6() ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" return 1 - lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" + if ! 
ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2" lret=1 fi done @@ -462,8 +422,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -471,14 +430,12 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" lret=1 fi @@ -487,27 +444,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! 
ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -526,23 +479,20 @@ test_masquerade() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 $natflags" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2" lret=1 fi done @@ -563,8 +513,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -572,15 +521,13 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4" lret=1 fi done @@ -588,27 +535,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? 
-ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -625,22 +568,19 @@ test_redirect6() ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" lret=1 fi @@ -662,8 +602,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! 
ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" lret=1 fi @@ -671,8 +610,7 @@ EOF # ns1 should have seen no packets from ns2, due to redirection expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" lret=1 fi @@ -681,15 +619,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -707,22 +643,19 @@ test_redirect() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" + if ! 
ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" lret=1 fi @@ -744,8 +677,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" lret=1 fi @@ -754,8 +686,7 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" lret=1 fi @@ -764,15 +695,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -803,13 +732,13 @@ test_port_shadow() # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. 
echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 - echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & - sc_r=$! + echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null & + local sc_r=$! + echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null & + local sc_c=$! - echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & - sc_c=$! - - sleep 0.3 + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u" + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u" # ns1 tries to connect to ns0:1405. With default settings this should connect # to client, it matches the conntrack entry created above. @@ -846,7 +775,7 @@ table $family filter { EOF test_port_shadow "port-filter" "ROUTER" - ip netns exec "$ns0" nft delete table $family filter + ip netns exec "$ns0" nft delete table "$family" filter } # This prevents port shadow of router service via notrack. @@ -868,7 +797,7 @@ table $family raw { EOF test_port_shadow "port-notrack" "ROUTER" - ip netns exec "$ns0" nft delete table $family raw + ip netns exec "$ns0" nft delete table "$family" raw } # This prevents port shadow of router service via sport remap. @@ -886,21 +815,19 @@ table $family pat { EOF test_port_shadow "pat" "ROUTER" - ip netns exec "$ns0" nft delete table $family pat + ip netns exec "$ns0" nft delete table "$family" pat } test_port_shadowing() { local family="ip" - conntrack -h >/dev/null 2>&1 - if [ $? -ne 0 ];then + if ! conntrack -h >/dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without conntrack tool" return fi - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! 
socat -h > /dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without socat tool" return fi @@ -946,8 +873,7 @@ test_stateless_nat_ip() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" return 1 fi @@ -981,23 +907,20 @@ EOF reset_counters - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null; then echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" lret=1 fi # ns1 should have seen packets from .2.2, due to stateless rewrite. expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" lret=1 fi for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" lret=1 fi @@ -1006,14 +929,12 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! 
ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" lret=1 fi @@ -1021,8 +942,7 @@ EOF reset_counters - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! socat -h > /dev/null 2>&1;then echo "SKIP: Could not run stateless nat frag test without socat tool" if [ $lret -eq 0 ]; then return $ksft_skip @@ -1032,42 +952,36 @@ EOF return $lret fi - local tmpfile=$(mktemp) - dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null - local outfile=$(mktemp) - ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & - sc_r=$! + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u" - sleep 1 # re-do with large ping -> ip fragmentation - ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 lret=1 fi wait - cmp "$tmpfile" "$outfile" - if [ $? -ne 0 ]; then - ls -l "$tmpfile" "$outfile" + if ! cmp "$INFILE" "$OUTFILE";then + ls -l "$INFILE" "$OUTFILE" echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 lret=1 fi - rm -f "$tmpfile" "$outfile" + :> "$OUTFILE" # ns1 should have seen packets from 2.2, due to stateless rewrite. expect="packets 3 bytes 4164" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? 
-ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" lret=1 fi - ip netns exec "$ns0" nft delete table ip stateless - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table ip stateless; then echo "ERROR: Could not delete table ip stateless" 1>&2 lret=1 fi @@ -1078,8 +992,8 @@ EOF } # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 -for i in 0 1 2; do -ip netns exec ns$i-$sfx nft -f /dev/stdin <<EOF +for i in "$ns0" "$ns1" "$ns2" ;do +ip netns exec "$i" nft -f /dev/stdin <<EOF table inet filter { counter ns0in {} counter ns1in {} @@ -1145,7 +1059,7 @@ done # special case for stateless nat check, counter needs to # be done before (input) ip defragmentation -ip netns exec ns1-$sfx nft -f /dev/stdin <<EOF +ip netns exec "$ns1" nft -f /dev/stdin <<EOF table inet filter { counter ns0insl {} @@ -1156,31 +1070,49 @@ table inet filter { } EOF -sleep 3 -# test basic connectivity -for i in 1 2; do - ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s)" 1>&2 - ret=1 - fi - - ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 - ret=1 - fi - check_counters ns$i-$sfx - if [ $? -ne 0 ]; then - ret=1 - fi - - check_ns0_counters ns$i - if [ $? -ne 0 ]; then - ret=1 - fi - reset_counters -done +ping_basic() +{ + i="$1" + if ! ip netns exec "$ns0" ping -c 1 -q 10.0."$i".99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s)" 1>&2 + ret=1 + fi + + if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 + ret=1 + fi +} + +test_basic_conn() +{ + local nsexec + name="$1" + + nsexec=$(eval echo \$"$1") + + ping_basic 1 + ping_basic 2 + + if ! check_counters "$nsexec";then + return 1 + fi + + if ! 
check_ns0_counters "$name";then + return 1 + fi + + reset_counters + return 0 +} + +if ! test_basic_conn "ns1" ; then + echo "ERROR: basic test for ns1 failed" 1>&2 + exit 1 +fi +if ! test_basic_conn "ns2"; then + echo "ERROR: basic test for ns1 failed" 1>&2 +fi if [ $ret -eq 0 ];then echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh index b9ab37380f33..549f264b41f3 100755 --- a/tools/testing/selftests/netfilter/nft_nat_zones.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -3,15 +3,14 @@ # Test connection tracking zone and NAT source port reallocation support. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Don't increase too much, 2000 clients should work # just fine but script can then take several minutes with # KASAN/debug builds. maxclients=100 -have_iperf=1 +have_socat=0 ret=0 # client1---. @@ -31,12 +30,6 @@ ret=0 # NAT Gateway is supposed to do port reallocation for each of the # connections. 
-sfx=$(mktemp -u "XXXXXXXX") -gw="ns-gw-$sfx" -cl1="ns-cl1-$sfx" -cl2="ns-cl2-$sfx" -srv="ns-srv-$sfx" - v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) @@ -46,61 +39,29 @@ v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) cleanup() { - ip netns del $gw - ip netns del $srv - for i in $(seq 1 $maxclients); do - ip netns del ns-cl$i-$sfx 2>/dev/null - done - - sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null + cleanup_all_ns + + sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" echo "run test without nft tool" +checktool "conntrack -V" "run test without conntrack tool" -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip +if socat -h >/dev/null 2>&1; then + have_socat=1 fi -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -iperf3 -v >/dev/null 2>&1 -if [ $? -ne 0 ];then - have_iperf=0 -fi - -ip netns add "$gw" -if [ $? 
-ne 0 ];then - echo "SKIP: Could not create net namespace $gw" - exit $ksft_skip -fi -ip -net "$gw" link set lo up +setup_ns gw srv trap cleanup EXIT -ip netns add "$srv" -if [ $? -ne 0 ];then - echo "SKIP: Could not create server netns $srv" - exit $ksft_skip -fi - ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" ip -net "$gw" link set veth0 up -ip -net "$srv" link set lo up ip -net "$srv" link set eth0 up sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null @@ -110,55 +71,49 @@ sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" +for i in $(seq 1 "$maxclients");do + setup_ns "cl$i" - ip netns add "$cl" - if [ $? -ne 0 ];then - echo "SKIP: Could not create client netns $cl" - exit $ksft_skip - fi - ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 - if [ $? -ne 0 ];then + cl=$(eval echo \$cl"$i") + if ! ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi done -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - echo netns exec "$cl" ip link set lo up +for i in $(seq 1 "$maxclients");do + cl=$(eval echo \$cl"$i") echo netns exec "$cl" ip link set eth0 up echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 - echo netns exec "$gw" ip link set veth$i up - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + echo netns exec "$gw" ip link set "veth$i" up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".rp_filter=0 # clients have same IP addresses. 
echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 - echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 # NB: same addresses on client-facing interfaces. - echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i - echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i" + echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad # gw: policy routing - echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i)) echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) + echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i)) done | ip -batch /dev/stdin ip -net "$gw" addr add 10.3.0.1/24 dev veth0 -ip -net "$gw" addr add dead:3::1/64 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad ip -net "$srv" addr add 10.3.0.99/24 dev eth0 -ip -net "$srv" addr add dead:3::99/64 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad -ip netns exec $gw nft -f /dev/stdin<<EOF +ip netns exec "$gw" nft -f /dev/stdin<<EOF table inet raw { map iiftomark { type ifname : mark @@ -203,18 +158,22 @@ table inet raw { } } EOF +if [ "$?" 
-ne 0 ];then + echo "SKIP: Could not add nftables rules" + exit $ksft_skip +fi ( echo add element inet raw iiftomark \{ for i in $(seq 1 $((maxclients-1))); do - echo \"veth$i\" : $i, + echo \"veth"$i"\" : "$i", done - echo \"veth$maxclients\" : $maxclients \} + echo \"veth"$maxclients"\" : "$maxclients" \} echo add element inet raw iiftozone \{ for i in $(seq 1 $((maxclients-1))); do - echo \"veth$i\" : $i, + echo \"veth"$i"\" : "$i", done echo \"veth$maxclients\" : $maxclients \} -) | ip netns exec $gw nft -f /dev/stdin +) | ip netns exec "$gw" nft -f /dev/stdin ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null @@ -224,73 +183,72 @@ ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null -for i in $(seq 1 $maxclients); do - cl="ns-cl$i-$sfx" - ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & - if [ $? -ne 0 ]; then - echo FAIL: Ping failure from $cl 1>&2 - ret=1 - break - fi +for i in $(seq 1 "$maxclients"); do + cl=$(eval echo \$cl"$i") + ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & done -wait +wait || ret=1 -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" - if [ $? -ne 0 ];then +[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2 + +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then ret=1 echo "FAIL: counter icmp mismatch for veth$i" 1>&2 - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 
10.3.0.99 }" 1>&2 + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 break fi done -ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then ret=1 - echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" - ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }" + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 fi -if [ $ret -eq 0 ]; then +if [ $ret -eq 0 ]; then echo "PASS: ping test from all $maxclients namespaces" fi -if [ $have_iperf -eq 0 ];then - echo "SKIP: iperf3 not installed" +if [ $have_socat -eq 0 ];then + echo "SKIP: socat not installed" if [ $ret -ne 0 ];then exit $ret fi exit $ksft_skip fi -ip netns exec $srv iperf3 -s > /dev/null 2>&1 & -iperfpid=$! -sleep 1 +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201 +} + +ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null & +socatpid=$! + +busywait 1000 listener_ready "$srv" -for i in $(seq 1 $maxclients); do +for i in $(seq 1 "$maxclients"); do if [ $ret -ne 0 ]; then break fi - cl="ns-cl$i-$sfx" - ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null - if [ $? -ne 0 ]; then - echo FAIL: Failure to connect for $cl 1>&2 - ip netns exec $gw conntrack -S 1>&2 + cl=$(eval echo \$cl"$i") + if ! 
ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then + echo "FAIL: Failure to connect for $cl" 1>&2 + ip netns exec "$gw" conntrack -S 1>&2 ret=1 fi done if [ $ret -eq 0 ];then - echo "PASS: iperf3 connections for all $maxclients net namespaces" + echo "PASS: socat connections for all $maxclients net namespaces" fi -kill $iperfpid +kill $socatpid wait -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null - if [ $? -ne 0 ];then +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then ret=1 echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 break @@ -300,8 +258,7 @@ if [ $ret -eq 0 ];then echo "PASS: Found client connection for all $maxclients net namespaces" fi -ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then ret=1 echo "FAIL: cannot find return entry on veth0" 1>&2 fi diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh new file mode 100755 index 000000000000..8538f08c64c2 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -0,0 +1,417 @@ +#!/bin/bash +# +# This tests nf_queue: +# 1. can process packets from all hooks +# 2. 
support running nfqueue from more than one base chain +# +# shellcheck disable=SC2162,SC2317 + +source lib.sh +ret=0 +timeout=2 + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null + + cleanup_all_ns + + rm -f "$TMPINPUT" + rm -f "$TMPFILE0" + rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" +} + +checktool "nft --version" "test without nft tool" + +trap cleanup EXIT + +setup_ns ns1 ns2 nsrouter + +TMPFILE0=$(mktemp) +TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) + +TMPINPUT=$(mktemp) +dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +load_ruleset() { + local name=$1 + local prio=$2 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table inet $name { + chain nfq { + ip protocol icmp queue bypass + icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass + } + chain pre { + type filter hook 
prerouting priority $prio; policy accept; + jump nfq + } + chain input { + type filter hook input priority $prio; policy accept; + jump nfq + } + chain forward { + type filter hook forward priority $prio; policy accept; + tcp dport 12345 queue num 2 + jump nfq + } + chain output { + type filter hook output priority $prio; policy accept; + tcp dport 12345 queue num 3 + tcp sport 23456 queue num 3 + jump nfq + } + chain post { + type filter hook postrouting priority $prio; policy accept; + jump nfq + } +} +EOF +} + +load_counter_ruleset() { + local prio=$1 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table inet countrules { + chain pre { + type filter hook prerouting priority $prio; policy accept; + counter + } + chain input { + type filter hook input priority $prio; policy accept; + counter + } + chain forward { + type filter hook forward priority $prio; policy accept; + counter + } + chain output { + type filter hook output priority $prio; policy accept; + counter + } + chain post { + type filter hook postrouting priority $prio; policy accept; + counter + } +} +EOF +} + +test_ping() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then + return 2 + fi + + return 0 +} + +test_ping_router() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then + return 3 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then + return 4 + fi + + return 0 +} + +test_queue_blackhole() { + local proto=$1 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table $proto blackh { + chain forward { + type filter hook forward priority 0; policy accept; + queue num 600 + } +} +EOF + if [ "$proto" = "ip" ] ;then + ip netns exec "$ns1" ping -W 2 -c 1 -q 10.0.2.99 > /dev/null + lret=$? + elif [ "$proto" = "ip6" ]; then + ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null + lret=$? 
+ else + lret=111 + fi + + # queue without bypass keyword should drop traffic if no listener exists. + if [ "$lret" -eq 0 ];then + echo "FAIL: $proto expected failure, got $lret" 1>&2 + exit 1 + fi + + if ! ip netns exec "$nsrouter" nft delete table "$proto" blackh; then + echo "FAIL: $proto: Could not delete blackh table" + exit 1 + fi + + echo "PASS: $proto: statement with no listener results in packet drop" +} + +nf_queue_wait() +{ + local procfile="/proc/self/net/netfilter/nfnetlink_queue" + local netns id + + netns="$1" + id="$2" + + # if this file doesn't exist, nfnetlink_module isn't loaded. + # rather than loading it ourselves, wait for kernel module autoload + # completion, nfnetlink should do so automatically because nf_queue + # helper program, spawned in the background, asked for this functionality. + test -f "$procfile" && + ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id " +} + +test_queue() +{ + local expected="$1" + local last="" + + # spawn nf_queue listeners + ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1 + + if ! test_ping;then + echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2 + exit $ret + fi + + if ! test_ping_router;then + echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2 + exit $ret + fi + + wait + ret=$? 
+ + for file in $TMPFILE0 $TMPFILE1; do + last=$(tail -n1 "$file") + if [ x"$last" != x"$expected packets total" ]; then + echo "FAIL: Expected $expected packets total, but got $last" 1>&2 + ip netns exec "$nsrouter" nft list ruleset + exit 1 + fi + done + + echo "PASS: Expected and received $last" +} + +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345 +} + +test_tcp_forward() +{ + ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + local nfqpid=$! + + timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + + ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" +} + +test_tcp_localhost() +{ + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + local nfqpid=$! 
+ + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_connectclose() +{ + ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + + wait && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } + chain post { + type filter hook postrouting priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } +} +EOF + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" & + + # nfqueue 1 will be called via output hook. But this time, + # re-queue the packet to nfqueue program on queue 2. + ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" & + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" +} + +test_icmp_vrf() { + if ! 
ip -net "$ns1" link add tvrf type vrf table 9876;then + echo "SKIP: Could not add vrf device" + return + fi + + ip -net "$ns1" li set eth0 master tvrf + ip -net "$ns1" li set tvrf up + + ip -net "$ns1" route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 +ip netns exec "$ns1" nft -f /dev/stdin <<EOF +flush ruleset +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + meta oifname "tvrf" icmp type echo-request counter queue num 1 + meta oifname "eth0" icmp type echo-request counter queue num 1 + } + chain post { + type filter hook postrouting priority 0; policy accept; + meta oifname "tvrf" icmp type echo-request counter queue num 1 + meta oifname "eth0" icmp type echo-request counter queue num 1 + } +} +EOF + ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" & + local nfqpid=$! + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 + + ip netns exec "$ns1" ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null + + for n in output post; do + for d in tvrf eth0; do + if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 + ip netns exec "$ns1" nft list ruleset + ret=1 + return + fi + done + done + + wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" + wait 2>/dev/null +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +load_ruleset "filter" 0 + +if test_ping; then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_queue_blackhole ip +test_queue_blackhole ip6 + +# dummy ruleset to add base chains between the +# queueing rules. 
We don't want the second reinject +# to re-execute the old hooks. +load_counter_ruleset 10 + +# we are hooking all: prerouting/input/forward/output/postrouting. +# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: +# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). +# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. +# so we expect that userspace program receives 10 packets. +test_queue 10 + +# same. We queue to a second program as well. +load_ruleset "filter2" 20 +test_queue 20 + +test_tcp_forward +test_tcp_localhost +test_tcp_localhost_connectclose +test_tcp_localhost_requeue +test_icmp_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh new file mode 100755 index 000000000000..293f667a6aec --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ret=0 + +checktool "nft --version" "run test without nft tool" +checktool "iperf3 --version" "run test without iperf3" + +setup_ns nsr ns1 ns2 + +modprobe -q nf_conntrack + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +trap cleanup EXIT + +ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2" + +for dev in veth0 veth1; do + ip -net "$nsr" link set "$dev" up +done + +ip -net "$nsr" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr" addr add 10.0.2.1/24 dev veth1 + +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1 +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1 +ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 + +for n in $ns1 $ns2; do + ip -net "$n" link set eth0 up +done +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 
10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + echo "ERROR: $ns1 cannot reach $ns2" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 & +# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & + +sleep 1 + +ip netns exec "$nsr" nft -f - <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority -300; policy accept; + meta iif veth0 tcp flags syn counter notrack + } + + chain forward { + type filter hook forward priority 0; policy accept; + + ct state new,established counter accept + + meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp + + ct state invalid counter drop + + # make ns2 unreachable w.o. tcp synproxy + tcp flags syn counter drop + } +} +EOF +if [ $? -ne 0 ]; then + echo "SKIP: Cannot add nft synproxy" + exit $ksft_skip +fi + +if ! ip netns exec "$ns1" timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null; then + echo "FAIL: iperf3 returned an error" 1>&2 + ret=1 + ip netns exec "$nsr" nft list ruleset +else + echo "PASS: synproxy connection successful" +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh index 5a8db0b48928..4ad75038f6ff 100755 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -3,11 +3,7 @@ # Test insertion speed for packets with identical addresses/ports # that are all placed in distinct conntrack zones. -sfx=$(mktemp -u "XXXXXXXX") -ns="ns-$sfx" - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 +source lib.sh zones=2000 have_ct_tool=0 @@ -15,35 +11,24 @@ ret=0 cleanup() { - ip netns del $ns -} - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi + cleanup_all_ns } checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" checktool "socat -V" "run test without socat tool" -checktool "ip netns add $ns" "create net namespace" + +setup_ns ns1 trap cleanup EXIT -conntrack -V > /dev/null 2>&1 -if [ $? -eq 0 ];then +if conntrack -V > /dev/null 2>&1; then have_ct_tool=1 fi -ip -net "$ns" link set lo up - test_zones() { local max_zones=$1 -ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 -ip netns exec $ns nft -f /dev/stdin<<EOF +ip netns exec "$ns1" nft -f /dev/stdin<<EOF flush ruleset table inet raw { map rndzone { @@ -56,29 +41,39 @@ table inet raw { } } EOF +if [ "$?" -ne 0 ];then + echo "SKIP: Cannot add nftables rules" + exit $ksft_skip +fi + + ip netns exec "$ns1" sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 + ( echo "add element inet raw rndzone {" - for i in $(seq 1 $max_zones);do + for i in $(seq 1 "$max_zones");do echo -n "$i : $i" - if [ $i -lt $max_zones ]; then + if [ "$i" -lt "$max_zones" ]; then echo "," else echo "}" fi done - ) | ip netns exec $ns nft -f /dev/stdin + ) | ip netns exec "$ns1" nft -f /dev/stdin local i=0 local j=0 - local outerstart=$(date +%s%3N) - local stop=$outerstart - - while [ $i -lt $max_zones ]; do - local start=$(date +%s%3N) + local outerstart + local stop + outerstart=$(date +%s%3N) + stop=$outerstart + + while [ "$i" -lt "$max_zones" ]; do + local start + start=$(date +%s%3N) i=$((i + 1000)) j=$((j + 1)) # nft rule in output places each packet in a different zone. 
- dd if=/dev/zero of=/dev/stdout bs=8k count=1000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + dd if=/dev/zero bs=8k count=1000 2>/dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345 if [ $? -ne 0 ] ;then ret=1 break @@ -89,14 +84,15 @@ EOF echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" done - if [ $have_ct_tool -eq 1 ]; then - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + if [ "$have_ct_tool" -eq 1 ]; then + local count duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -eq "$max_zones" ]; then echo "PASS: inserted $count entries from packet path in $duration ms total" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" ret=1 fi @@ -110,18 +106,19 @@ EOF test_conntrack_tool() { local max_zones=$1 - ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null - local outerstart=$(date +%s%3N) - local start=$(date +%s%3N) - local stop=$start - local i=0 - while [ $i -lt $max_zones ]; do + local outerstart start stop i + outerstart=$(date +%s%3N) + start=$(date +%s%3N) + stop="$start" + i=0 + while [ "$i" -lt "$max_zones" ]; do i=$((i + 1)) - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 if [ $? 
-ne 0 ];then - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null echo "FAIL: conntrack -I returned an error" ret=1 @@ -137,13 +134,15 @@ test_conntrack_tool() { fi done - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + local count + local duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -eq "$max_zones" ]; then echo "PASS: inserted $count entries via ctnetlink in $duration ms" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" ret=1 fi @@ -151,7 +150,7 @@ test_conntrack_tool() { test_zones $zones -if [ $have_ct_tool -eq 1 ];then +if [ "$have_ct_tool" -eq 1 ];then test_conntrack_tool $zones else echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh index 5289c8447a41..4485fd7675ed 100755 --- a/tools/testing/selftests/netfilter/rpath.sh +++ b/tools/testing/selftests/net/netfilter/rpath.sh @@ -64,12 +64,18 @@ ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad # firewall matches to test [ -n "$iptables" ] && { common='-t raw -A PREROUTING -s 192.168.0.0/16' - ip netns exec "$ns2" "$iptables" $common -m rpfilter + if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert } [ -n "$ip6tables" ] && { common='-t raw -A PREROUTING -s fec0::/16' - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter + if ! 
ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert } [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - <<EOF diff --git a/tools/testing/selftests/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c index 21bb1cfd8a85..21bb1cfd8a85 100644 --- a/tools/testing/selftests/netfilter/sctp_collision.c +++ b/tools/testing/selftests/net/netfilter/sctp_collision.c diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings new file mode 100644 index 000000000000..288bd9704773 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/settings @@ -0,0 +1 @@ +timeout=500 diff --git a/tools/testing/selftests/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh index 1802653a4728..8d401c69e317 100755 --- a/tools/testing/selftests/netfilter/xt_string.sh +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -5,53 +5,57 @@ ksft_skip=4 rc=0 -if ! iptables --version >/dev/null 2>&1; then - echo "SKIP: Test needs iptables" - exit $ksft_skip -fi -if ! ip -V >/dev/null 2>&1; then - echo "SKIP: Test needs iproute2" - exit $ksft_skip -fi -if ! 
nc -h >/dev/null 2>&1; then - echo "SKIP: Test needs netcat" - exit $ksft_skip -fi +source lib.sh + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "test needs iptables" + +infile=$(mktemp) + +cleanup() +{ + ip netns del "$netns" + rm -f "$infile" +} + +trap cleanup EXIT + +setup_ns netns + +ip -net "$netns" link add d0 type dummy +ip -net "$netns" link set d0 up +ip -net "$netns" addr add 10.1.2.1/24 dev d0 pattern="foo bar baz" patlen=11 hdrlen=$((20 + 8)) # IPv4 + UDP -ns="ns-$(mktemp -u XXXXXXXX)" -trap 'ip netns del $ns' EXIT -ip netns add "$ns" -ip -net "$ns" link add d0 type dummy -ip -net "$ns" link set d0 up -ip -net "$ns" addr add 10.1.2.1/24 dev d0 - -#ip netns exec "$ns" tcpdump -npXi d0 & + +#ip netns exec "$netns" tcpdump -npXi d0 & #tcpdump_pid=$! -#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT +#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT add_rule() { # (alg, from, to) - ip netns exec "$ns" \ + ip netns exec "$netns" \ iptables -A OUTPUT -o d0 -m string \ - --string "$pattern" --algo $1 --from $2 --to $3 + --string "$pattern" --algo "$1" --from "$2" --to "$3" } showrules() { # () - ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' + ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A' } zerorules() { - ip netns exec "$ns" iptables -Z OUTPUT + ip netns exec "$netns" iptables -Z OUTPUT } countrule() { # (pattern) showrules | grep -c -- "$*" } send() { # (offset) - ( for ((i = 0; i < $1 - $hdrlen; i++)); do - printf " " + ( for ((i = 0; i < $1 - hdrlen; i++)); do + echo -n " " done - printf "$pattern" - ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 + echo -n "$pattern" + ) > "$infile" + + ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile" } add_rule bm 1000 1500 @@ -61,8 +65,8 @@ add_rule kmp 1400 1600 zerorules send 0 -send $((1000 - $patlen)) -if [ $(countrule -c 0 0) -ne 4 ]; then +send $((1000 - patlen)) +if [ "$(countrule -c 0 0)" -ne 4 ]; then echo "FAIL: rules 
match data before --from" showrules ((rc--)) @@ -70,16 +74,16 @@ fi zerorules send 1000 -send $((1400 - $patlen)) -if [ $(countrule -c 2) -ne 2 ]; then +send $((1400 - patlen)) +if [ "$(countrule -c 2)" -ne 2 ]; then echo "FAIL: only two rules should match at low offset" showrules ((rc--)) fi zerorules -send $((1500 - $patlen)) -if [ $(countrule -c 1) -ne 4 ]; then +send $((1500 - patlen)) +if [ "$(countrule -c 1)" -ne 4 ]; then echo "FAIL: all rules should match at end of packet" showrules ((rc--)) @@ -87,7 +91,7 @@ fi zerorules send 1495 -if [ $(countrule -c 1) -ne 1 ]; then +if [ "$(countrule -c 1)" -ne 1 ]; then echo "FAIL: only kmp with proper --to should match pattern spanning fragments" showrules ((rc--)) @@ -95,23 +99,23 @@ fi zerorules send 1500 -if [ $(countrule -c 1) -ne 2 ]; then +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at start of second fragment" showrules ((rc--)) fi zerorules -send $((1600 - $patlen)) -if [ $(countrule -c 1) -ne 2 ]; then +send $((1600 - patlen)) +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at end of largest --to" showrules ((rc--)) fi zerorules -send $((1600 - $patlen + 1)) -if [ $(countrule -c 1) -ne 0 ]; then +send $((1600 - patlen + 1)) +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rules should match pattern extending largest --to" showrules ((rc--)) @@ -119,10 +123,11 @@ fi zerorules send 1600 -if [ $(countrule -c 1) -ne 0 ]; then +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rule should match pattern past largest --to" showrules ((rc--)) fi +[ $rc -eq 0 ] && echo "PASS: string match tests" exit $rc diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py new file mode 100755 index 000000000000..93d9d914529b --- /dev/null +++ b/tools/testing/selftests/net/nl_netdev.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +from lib.py import ksft_run, 
ksft_exit, ksft_pr +from lib.py import ksft_eq, ksft_ge, ksft_busy_wait +from lib.py import NetdevFamily, NetdevSimDev, ip + + +def empty_check(nf) -> None: + devs = nf.dev_get({}, dump=True) + ksft_ge(len(devs), 1) + + +def lo_check(nf) -> None: + lo_info = nf.dev_get({"ifindex": 1}) + ksft_eq(len(lo_info['xdp-features']), 0) + ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) + + +def page_pool_check(nf) -> None: + with NetdevSimDev() as nsimdev: + nsim = nsimdev.nsims[0] + + def up(): + ip(f"link set dev {nsim.ifname} up") + + def down(): + ip(f"link set dev {nsim.ifname} down") + + def get_pp(): + pp_list = nf.page_pool_get({}, dump=True) + return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex] + + # No page pools when down + down() + ksft_eq(len(get_pp()), 0) + + # Up, empty page pool appears + up() + pp_list = get_pp() + ksft_ge(len(pp_list), 0) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 0) + + # Down, it disappears, again + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 0) + + # Up, allocate a page + up() + nsim.dfs_write("pp_hold", "y") + pp_list = nf.page_pool_get({}, dump=True) + refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex]) + ksft_ge(refs, 1) + + # Now let's leak a page + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 1) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 1) + attached = [pp for pp in pp_list if "detach-time" not in pp] + ksft_eq(len(attached), 0) + + # New pp can get created, and we'll have two + up() + pp_list = get_pp() + attached = [pp for pp in pp_list if "detach-time" not in pp] + detached = [pp for pp in pp_list if "detach-time" in pp] + ksft_eq(len(attached), 1) + ksft_eq(len(detached), 1) + + # Free the old page and the old pp is gone + nsim.dfs_write("pp_hold", "n") + # Freeing check is once a second so we may need to retry + ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2) + + # And down... 
+ down() + ksft_eq(len(get_pp()), 0) + + # Last, leave the page hanging for destroy, nothing to check + # we're trying to exercise the orphaning path in the kernel + up() + nsim.dfs_write("pp_hold", "y") + + +def main() -> None: + nf = NetdevFamily() + ksft_run([empty_check, lo_check, page_pool_check], + args=(nf, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 5e0e539a323d..1dd057afd3fb 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -489,7 +489,7 @@ class ovsactions(nla): actstr, reason = parse_extract_field( actstr, "drop(", - "([0-9]+)", + r"([0-9]+)", lambda x: int(x, 0), False, None, @@ -502,9 +502,9 @@ class ovsactions(nla): actstr = actstr[len("drop"): ] return (totallen - len(actstr)) - elif parse_starts_block(actstr, "^(\d+)", False, True): + elif parse_starts_block(actstr, r"^(\d+)", False, True): actstr, output = parse_extract_field( - actstr, None, "(\d+)", lambda x: int(x), False, "0" + actstr, None, r"(\d+)", lambda x: int(x), False, "0" ) self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output]) parsed = True @@ -512,7 +512,7 @@ class ovsactions(nla): actstr, recircid = parse_extract_field( actstr, "recirc(", - "([0-9a-fA-Fx]+)", + r"([0-9a-fA-Fx]+)", lambda x: int(x, 0), False, 0, @@ -588,17 +588,17 @@ class ovsactions(nla): actstr = actstr[3:] actstr, ip_block_min = parse_extract_field( - actstr, "=", "([0-9a-fA-F\.]+)", str, False + actstr, "=", r"([0-9a-fA-F\.]+)", str, False ) actstr, ip_block_max = parse_extract_field( - actstr, "-", "([0-9a-fA-F\.]+)", str, False + actstr, "-", r"([0-9a-fA-F\.]+)", str, False ) actstr, proto_min = parse_extract_field( - actstr, ":", "(\d+)", int, False + actstr, ":", r"(\d+)", int, False ) actstr, proto_max = parse_extract_field( - actstr, "-", "(\d+)", int, False + actstr, "-", r"(\d+)", int, 
False ) if t is not None: diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c index 7c5b12664b03..bfb07dc49518 100644 --- a/tools/testing/selftests/net/reuseaddr_conflict.c +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -109,6 +109,6 @@ int main(void) fd1 = open_port(0, 1); if (fd1 >= 0) error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); - fprintf(stderr, "Success"); + fprintf(stderr, "Success\n"); return 0; } diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c index 495990d355ef..43ca92594926 100644 --- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c +++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c @@ -17,7 +17,7 @@ struct { } array SEC(".maps"); /* Sample program which should always load for testing control paths. */ -SEC(".text") int func() +SEC("xdp") int func() { __u64 key64 = 0; __u32 key = 0; diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/net/sample_ret0.bpf.c index fec99750d6ea..1df5ca98bb65 100644 --- a/tools/testing/selftests/bpf/progs/sample_ret0.c +++ b/tools/testing/selftests/net/sample_ret0.bpf.c @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#define SEC(name) __attribute__((section(name), used)) + /* Sample program which should always load for testing control paths. 
*/ +SEC("xdp") int func() { return 0; diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba6..8b984fa04286 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f..e408b9243b2c 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - 
ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39..a2fe88d35ac0 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, 
ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a..517930f9721b 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 8802604148dd..11a1ebda564f 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" # set global exit status, but never reset nonzero one. 
check_err() diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 7080eae5312b..c51ea90a1395 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index e1ff645bd3d1..17404f49cdb6 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" @@ -42,8 +42,8 @@ run_one() { ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact - tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6 direct-action - tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action + tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.bpf.o section schedcls/ingress6/nat_6 direct-action + tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action echo ${rx_args} ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & @@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then exit -1 fi -if [ ! -f nat6to4.o ]; then +if [ ! -f nat6to4.bpf.o ]; then echo "Missing nat6to4 helper. 
Run 'make' first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 380cb15e942e..550d8eb3e224 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -3,7 +3,7 @@ source net_helper.sh -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly BASE="ns-$(mktemp -u XXXXXX)" readonly SRC=2 readonly DST=1 @@ -244,7 +244,7 @@ for family in 4 6; do create_vxlan_pair ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on - run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10 cleanup # use NAT to circumvent GRO FWD check @@ -258,13 +258,7 @@ for family in 4 6; do # load arp cache before running the test to reduce the amount of # stray traffic on top of the UDP tunnel ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null - run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST - cleanup - - create_vxlan_pair - run_bench "UDP tunnel fwd perf" $OL_NET$DST - ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on - run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST cleanup done diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af3..85b3baa3f7f3 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 3a394b43e274..4f1edbafb946 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 
-BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" readonly BASE=`basename $STATS` readonly SRC=2 diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.bpf.c index d988b2e0cee8..d988b2e0cee8 100644 --- a/tools/testing/selftests/net/xdp_dummy.c +++ b/tools/testing/selftests/net/xdp_dummy.bpf.c diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile deleted file mode 100644 index 936c3085bb83..000000000000 --- a/tools/testing/selftests/netfilter/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for netfilter selftests - -TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ - ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \ - conntrack_sctp_collision.sh xt_string.sh \ - bridge_netfilter.sh - -HOSTPKG_CONFIG := pkg-config - -CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) -LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) - -TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision \ - conntrack_dump_flush - -include ../lib.mk diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh deleted file mode 100755 index 29f3955b9af7..000000000000 --- a/tools/testing/selftests/netfilter/bridge_brouter.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# -# This test is for bridge 'brouting', i.e. make some packets being routed -# rather than getting bridged even though they arrive on interface that is -# part of a bridge. - -# eth0 br0 eth0 -# setup is: ns1 <-> ns0 <-> ns2 - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 -ret=0 - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ns0 -ip netns add ns1 -ip netns add ns2 - -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi -ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 - -ip -net ns0 link set lo up -ip -net ns0 link set veth0 up -ip -net ns0 link set veth1 up - -ip -net ns0 link add br0 type bridge -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -ip -net ns0 link set veth0 master br0 -ip -net ns0 link set veth1 master br0 -ip -net ns0 link set br0 up -ip -net ns0 addr add 10.0.0.1/24 dev br0 - -# place both in same subnet, ns1 and ns2 connected via ns0:br0 -for i in 1 2; do - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -test_ebtables_broute() -{ - local cipt - - # redirect is needed so the dstmac is rewritten to the bridge itself, - # ip stack won't process OTHERHOST (foreign unicast mac) packets. - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - if [ $? -ne 0 ]; then - echo "SKIP: Could not add ebtables broute redirect rule" - return $ksft_skip - fi - - # ping netns1, expected to not work (ip forwarding is off) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed" 1>&2 - return 1 - fi - - # enable forwarding on both interfaces. - # neither needs an ip address, but at least the bridge needs - # an ip address in same network segment as ns1 and ns2 (ns0 - # needs to be able to determine route for to-be-forwarded packet). 
- ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 - ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 - - sleep 1 - - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule" - ip netns exec ns0 ebtables -t broute -F - - # ping netns1, expected to work (frames are bridged) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (bridged)" 1>&2 - return 1 - fi - - ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP - - # ping netns1, expected to not work (DROP in bridge forward) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 - return 1 - fi - - # re-activate brouter - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - - ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" - return 0 -} - -# test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns2 from ns1" 1>&2 - ret=1 -fi - -ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns1 from ns2" 1>&2 - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 and ns2 can reach each other" -fi - -test_ebtables_broute -ret=$? 
-for i in 0 1 2; do ip netns del ns$i;done - -exit $ret diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh deleted file mode 100644 index 659b3ab02c8b..000000000000 --- a/tools/testing/selftests/netfilter/bridge_netfilter.sh +++ /dev/null @@ -1,188 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bridge netfilter + conntrack, a combination that doesn't really work, -# with multicast/broadcast packets racing for hash table insertion. - -# eth0 br0 eth0 -# setup is: ns1 <->,ns0 <-> ns3 -# ns2 <-' `'-> ns4 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" -ns3="ns3-$sfx" -ns4="ns4-$sfx" - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -for i in $(seq 0 4); do - eval ip netns add \$ns$i -done - -cleanup() { - for i in $(seq 0 4); do eval ip netns del \$ns$i;done -} - -trap cleanup EXIT - -do_ping() -{ - fromns="$1" - dstip="$2" - - ip netns exec $fromns ping -c 1 -q $dstip > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - ret=1 - fi -} - -bcast_ping() -{ - fromns="$1" - dstip="$2" - - for i in $(seq 1 1000); do - ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "ERROR: ping -b from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - fi - done -} - -ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} -if [ $? 
-ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi - -ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 -ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 -ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 - -ip -net ${ns0} link set lo up - -for i in $(seq 1 4); do - ip -net ${ns0} link set veth$i up -done - -ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -# make veth0,1,2 part of bridge. -for i in $(seq 1 3); do - ip -net ${ns0} link set veth$i master br0 -done - -# add a macvlan on top of the bridge. -MACVLAN_ADDR=ba:f3:13:37:42:23 -ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private -ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan0 up -ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 - -# add a macvlan on top of veth4. -MACVLAN_ADDR=ba:f3:13:37:42:24 -ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa -ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan4 up - -# make the macvlan part of the bridge. -# veth4 is not a bridge port, only the macvlan on top of it. -ip -net ${ns0} link set macvlan4 master br0 - -ip -net ${ns0} link set br0 up -ip -net ${ns0} addr add 10.0.0.1/24 dev br0 -ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 -ret=$? -if [ $ret -ne 0 ] ; then - echo "SKIP: bridge netfilter not available" - ret=$ksft_skip -fi - -# for testing, so namespaces will reply to ping -b probes. -ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 - -# enable conntrack in ns0 and drop broadcast packets in forward to -# avoid them from getting confirmed in the postrouting hook before -# the cloned skb is passed up the stack. 
-ip netns exec ${ns0} nft -f - <<EOF -table ip filter { - chain input { - type filter hook input priority 1; policy accept - iifname br0 counter - ct state new accept - } -} - -table bridge filter { - chain forward { - type filter hook forward priority 0; policy accept - meta pkttype broadcast ip protocol icmp counter drop - } -} -EOF - -# place 1, 2 & 3 in same subnet, connected via ns0:br0. -# ns4 is placed in same subnet as well, but its not -# part of the bridge: the corresponding veth4 is not -# part of the bridge, only its macvlan interface. -for i in $(seq 1 4); do - eval ip -net \$ns$i link set lo up - eval ip -net \$ns$i link set eth0 up -done -for i in $(seq 1 2); do - eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -ip -net ${ns3} addr add 10.23.0.13/24 dev eth0 -ip -net ${ns4} addr add 10.23.0.14/24 dev eth0 - -# test basic connectivity -do_ping ${ns1} 10.0.0.12 -do_ping ${ns3} 10.23.0.1 -do_ping ${ns4} 10.23.0.1 - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0" -fi - -bcast_ping ${ns1} 10.0.0.255 - -# This should deliver broadcast to macvlan0, which is on top of ns0:br0. -bcast_ping ${ns3} 10.23.0.255 - -# same, this time via veth4:macvlan4. 
-bcast_ping ${ns4} 10.23.0.255 - -read t < /proc/sys/kernel/tainted - -if [ $t -eq 0 ];then - echo PASS: kernel not tainted -else - echo ERROR: kernel is tainted - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config deleted file mode 100644 index 7c42b1b2c69b..000000000000 --- a/tools/testing/selftests/netfilter/config +++ /dev/null @@ -1,9 +0,0 @@ -CONFIG_NET_NS=y -CONFIG_NF_TABLES_INET=y -CONFIG_NFT_QUEUE=m -CONFIG_NFT_NAT=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NF_CT_NETLINK=m -CONFIG_AUDIT=y diff --git a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh deleted file mode 100755 index a924e595cfd8..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Testing For SCTP COLLISION SCENARIO as Below: -# -# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] -# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] -# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] -# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] -# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] -# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] -# -# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS - -CLIENT_NS=$(mktemp -u client-XXXXXXXX) -CLIENT_IP="198.51.200.1" -CLIENT_PORT=1234 - -SERVER_NS=$(mktemp -u server-XXXXXXXX) -SERVER_IP="198.51.100.1" -SERVER_PORT=1234 - -ROUTER_NS=$(mktemp -u router-XXXXXXXX) -CLIENT_GW="198.51.200.2" -SERVER_GW="198.51.100.2" - -# setup the topo -setup() { - ip net add $CLIENT_NS - ip net add $SERVER_NS - ip net add $ROUTER_NS - ip 
-n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS - ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS - - ip -n $SERVER_NS link set link0 up - ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 - ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW - - ip -n $ROUTER_NS link set link1 up - ip -n $ROUTER_NS link set link2 up - ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 - ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 - ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 - - ip -n $CLIENT_NS link set link3 up - ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 - ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW - - # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with - # tc on $SERVER_NS side - tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb - tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ - 0xff match u8 2 0xff at 32 flowid 1:1 - tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms - - # simulate the ctstate check on OVS nf_conntrack - ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP - - # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe sctp - ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 -} - -cleanup() { - ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null - ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null - ip net del "$CLIENT_NS" - ip net del "$SERVER_NS" - ip net del "$ROUTER_NS" -} - -do_test() { - ip net exec $SERVER_NS ./sctp_collision server \ - $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & - ip net exec $CLIENT_NS ./sctp_collision client \ - $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT -} - -# NOTE: one way to work around the issue 
is set a smaller hb_interval -# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 - -# run the test case -trap cleanup EXIT -setup && \ -echo "Test for SCTP Collision in nf_conntrack:" && \ -do_test && echo "PASS!" -exit $? diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh deleted file mode 100755 index e7d7bf13cff5..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that UNREPLIED tcp conntrack will eventually timeout. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -waittime=20 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - - ip netns del $ns1 - ip netns del $ns2 -} - -ipv4() { - echo -n 192.168.$1.2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") - if [ $? 
-ne 0 ]; then - echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -# Create test namespaces -ip netns add $ns1 || exit 1 - -trap cleanup EXIT - -ip netns add $ns2 || exit 1 - -# Connect the namespace to the host using a veth pair -ip -net $ns1 link add name veth1 type veth peer name veth2 -ip -net $ns1 link set netns $ns2 dev veth2 - -ip -net $ns1 link set up dev lo -ip -net $ns2 link set up dev lo -ip -net $ns1 link set up dev veth1 -ip -net $ns2 link set up dev veth2 - -ip -net $ns2 addr add 10.11.11.2/24 dev veth2 -ip -net $ns2 route add default via 10.11.11.1 - -ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 - -# add a rule inside NS so we enable conntrack -ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT - -ip -net $ns1 addr add 10.11.11.1/24 dev veth1 -ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 - -# Check connectivity works -ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 - -ip netns exec $ns2 nc -l -p 8080 < /dev/null & - -# however, conntrack entries are there - -ip netns exec $ns2 nft -f - <<EOF -table inet filter { - counter connreq { } - counter redir { } - chain input { - type filter hook input priority 0; policy accept; - ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept - ct state new ct status dnat tcp dport 8080 counter name "redir" accept - } -} -EOF -if [ $? 
-ne 0 ]; then - echo "ERROR: Could not load nft rules" - exit 1 -fi - -ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10 - -echo "INFO: connect $ns1 -> $ns2 to the virtual ip" -ip netns exec $ns1 bash -c 'while true ; do - nc -p 60000 10.99.99.99 80 - sleep 1 - done' & - -sleep 1 - -ip netns exec $ns2 nft -f - <<EOF -table inet nat { - chain prerouting { - type nat hook prerouting priority 0; policy accept; - ip daddr 10.99.99.99 tcp dport 80 redirect to :8080 - } -} -EOF -if [ $? -ne 0 ]; then - echo "ERROR: Could not load nat redirect" - exit 1 -fi - -count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) -if [ $count -eq 0 ]; then - echo "ERROR: $ns2 did not pick up tcp connection from peer" - exit 1 -fi - -echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" -for i in $(seq 1 $waittime); do - echo -n "." - - sleep 1 - - count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) - if [ $count -gt 0 ]; then - echo - echo "PASS: redirection took effect after $i seconds" - break - fi - - m=$((i%20)) - if [ $m -eq 0 ]; then - echo " waited for $i seconds" - fi -done - -expect="packets 1 bytes 60" -check_counter "$ns2" "redir" "$expect" -if [ $? 
-ne 0 ]; then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: redirection counter has expected values" -else - echo "ERROR: no tcp connection was redirected" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh deleted file mode 100755 index c3b8f90c497e..000000000000 --- a/tools/testing/selftests/netfilter/ipvs.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# End-to-end ipvs test suite -# Topology: -#--------------------------------------------------------------+ -# | | -# ns0 | ns1 | -# ----------- | ----------- ----------- | -# | veth01 | --------- | veth10 | | veth12 | | -# ----------- peer ----------- ----------- | -# | | | | -# ----------- | | | -# | br0 | |----------------- peer |--------------| -# ----------- | | | -# | | | | -# ---------- peer ---------- ----------- | -# | veth02 | --------- | veth20 | | veth21 | | -# ---------- | ---------- ----------- | -# | ns2 | -# | | -#--------------------------------------------------------------+ -# -# We assume that all network driver are loaded -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -readonly port=8080 - -readonly vip_v4=207.175.44.110 -readonly cip_v4=10.0.0.2 -readonly gip_v4=10.0.0.1 -readonly dip_v4=172.16.0.1 -readonly rip_v4=172.16.0.2 -readonly sip_v4=10.0.0.3 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" -readonly datalen=32 - -sysipvsnet="/proc/sys/net/ipv4/vs/" -if [ ! -d $sysipvsnet ]; then - modprobe -q ip_vs - if [ $? -ne 0 ]; then - echo "skip: could not run test without ipvs module" - exit $ksft_skip - fi -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ipvsadm -v > /dev/null 2>&1 -if [ $? 
-ne 0 ]; then - echo "SKIP: Could not run test without ipvsadm" - exit $ksft_skip -fi - -setup() { - ip netns add ns0 - ip netns add ns1 - ip netns add ns2 - - ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 - ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 - ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 - - ip netns exec ns0 ip link set veth01 up - ip netns exec ns0 ip link set veth02 up - ip netns exec ns0 ip link add br0 type bridge - ip netns exec ns0 ip link set veth01 master br0 - ip netns exec ns0 ip link set veth02 master br0 - ip netns exec ns0 ip link set br0 up - ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 - - ip netns exec ns1 ip link set lo up - ip netns exec ns1 ip link set veth10 up - ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 - ip netns exec ns1 ip link set veth12 up - ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 - - ip netns exec ns2 ip link set lo up - ip netns exec ns2 ip link set veth21 up - ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 - ip netns exec ns2 ip link set veth20 up - ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - for i in 0 1 2 - do - ip netns del ns$i > /dev/null 2>&1 - done - - if [ -f "${outfile}" ]; then - rm "${outfile}" - fi - if [ -f "${infile}" ]; then - rm "${infile}" - fi -} - -server_listen() { - ip netns exec ns2 nc -l -p 8080 > "${outfile}" & - server_pid=$! 
- sleep 0.2 -} - -client_connect() { - ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" -} - -verify_data() { - wait "${server_pid}" - cmp "$infile" "$outfile" 2>/dev/null -} - -test_service() { - server_listen - client_connect - verify_data -} - - -test_dr() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - # avoid incorrect arp response - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - # avoid reverse route lookup - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -test_nat() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 ip link del veth20 - ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 - - test_service -} - -test_tun() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 modprobe ipip - ip netns exec ns1 ip link set tunl0 up - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 modprobe ipip 
- ip netns exec ns2 ip link set tunl0 up - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -run_tests() { - local errors= - - echo "Testing DR mode..." - cleanup - setup - test_dr - errors=$(( $errors + $? )) - - echo "Testing NAT mode..." - cleanup - setup - test_nat - errors=$(( $errors + $? )) - - echo "Testing Tunnel mode..." - cleanup - setup - test_tun - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup EXIT - -run_tests - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh deleted file mode 100755 index a1aa8f4a5828..000000000000 --- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test NAT source port clash resolution -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -socatpid=0 - -cleanup() -{ - [ $socatpid -gt 0 ] && kill $socatpid - ip netns del $ns1 - ip netns del $ns2 -} - -socat -h > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without socat" - exit $ksft_skip -fi - -iptables --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without iptables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns1" -if [ $? 
-ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add $ns2 - -# Connect the namespaces using a veth pair -ip link add name veth2 type veth peer name veth1 -ip link set netns $ns1 dev veth1 -ip link set netns $ns2 dev veth2 - -ip netns exec $ns1 ip link set up dev lo -ip netns exec $ns1 ip link set up dev veth1 -ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 - -ip netns exec $ns2 ip link set up dev lo -ip netns exec $ns2 ip link set up dev veth2 -ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 - -# Create a server in one namespace -ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & -socatpid=$! - -# Restrict source port to just one so we don't have to exhaust -# all others. -ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" - -# add a virtual IP using DNAT -ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 - -# ... and route it to the other namespace -ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 - -sleep 1 - -# add a persistent connection from the other namespace -ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & - -sleep 1 - -# ip daddr:dport will be rewritten to 192.168.1.1 5201 -# NAT must reallocate source port 10000 because -# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use -echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null -ret=$? - -# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). -if [ $ret -eq 0 ]; then - echo "PASS: socat can connect via NAT'd address" -else - echo "FAIL: socat cannot connect via NAT'd address" -fi - -# check sport clashres. 
-ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 - -sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & -cpid1=$! -sleep 1 - -# if connect succeeds, client closes instantly due to EOF on stdin. -# if connect hangs, it will time out after 5s. -echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & -cpid2=$! - -time_then=$(date +%s) -wait $cpid2 -rv=$? -time_now=$(date +%s) - -# Check how much time has elapsed, expectation is for -# 'cpid2' to connect and then exit (and no connect delay). -delta=$((time_now - time_then)) - -if [ $delta -lt 2 -a $rv -eq 0 ]; then - echo "PASS: could connect to service via redirected ports" -else - echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh deleted file mode 100755 index faa7778d7bd1..000000000000 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# This tests connection tracking helper assignment: -# 1. can attach ftp helper to a connection from nft ruleset. -# 2. auto-assign still works. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -testipv6=1 - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -which nc >/dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without netcat tool" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set veth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set veth0 up - -ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 -ip -net ${ns1} addr add dead:1::1/64 dev veth0 - -ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 -ip -net ${ns2} addr add dead:1::2/64 dev veth0 - -load_ruleset_family() { - local family=$1 - local ns=$2 - -ip netns exec ${ns} nft -f - <<EOF -table $family raw { - ct helper ftp { - type "ftp" protocol tcp - } - chain pre { - type filter hook prerouting priority 0; policy accept; - tcp dport 2121 ct helper set "ftp" - } - chain output { - type filter hook output priority 0; policy accept; - tcp dport 2121 ct helper set "ftp" - } -} -EOF - return $? -} - -check_for_helper() -{ - local netns=$1 - local message=$2 - local port=$3 - - if echo $message |grep -q 'ipv6';then - local family="ipv6" - else - local family="ipv4" - fi - - ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp' - if [ $? 
-ne 0 ] ; then - if [ $autoassign -eq 0 ] ;then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 - else - echo "PASS: ${netns} did not show attached helper $message" 1>&2 - fi - else - if [ $autoassign -eq 0 ] ;then - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 - else - echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 - ret=1 - fi - fi - - return 0 -} - -test_helper() -{ - local port=$1 - local autoassign=$2 - - if [ $autoassign -eq 0 ] ;then - msg="set via ruleset" - else - msg="auto-assign" - fi - - sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ip $msg" $port $autoassign - check_for_helper "$ns2" "ip $msg" $port $autoassign - - wait - - if [ $testipv6 -eq 0 ] ;then - return 0 - fi - - ip netns exec ${ns1} conntrack -F 2> /dev/null - ip netns exec ${ns2} conntrack -F 2> /dev/null - - sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ipv6 $msg" $port - check_for_helper "$ns2" "ipv6 $msg" $port - - wait -} - -load_ruleset_family ip ${ns1} -if [ $? -ne 0 ];then - echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 - exit 1 -fi - -load_ruleset_family ip6 ${ns1} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 - testipv6=0 -fi - -load_ruleset_family inet ${ns2} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 - load_ruleset_family ip ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 - exit 1 - fi - - if [ $testipv6 -eq 1 ] ;then - load_ruleset_family ip6 ${ns2} - if [ $? 
-ne 0 ];then - echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 - exit 1 - fi - fi -fi - -test_helper 2121 0 -ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 1 - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh deleted file mode 100755 index dff476e45e77..000000000000 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ /dev/null @@ -1,273 +0,0 @@ -#!/bin/bash -# -# This tests the fib expression. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -trap cleanup EXIT - -dmesg | grep -q ' nft_rpfilter: ' -if [ $? -eq 0 ]; then - dmesg -c | grep ' nft_rpfilter: ' - echo "WARN: a previous test run has failed" 1>&2 -fi - -sysctl -q net.netfilter.nf_log_all_netns=1 -ip netns add ${ns1} -ip netns add ${ns2} - -load_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority 0; policy accept; - fib saddr . 
iif oif missing counter log prefix "$netns nft_rpfilter: " drop - } -} -EOF -} - -load_pbr_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain forward { - type filter hook forward priority raw; - fib saddr . iif oif gt 0 accept - log drop - } -} -EOF -} - -load_ruleset_count() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority 0; policy accept; - ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop - ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop - } -} -EOF -} - -check_drops() { - dmesg | grep -q ' nft_rpfilter: ' - if [ $? -eq 0 ]; then - dmesg | grep ' nft_rpfilter: ' - echo "FAIL: rpfilter did drop packets" - return 1 - fi - - return 0 -} - -check_fib_counter() { - local want=$1 - local ns=$2 - local address=$3 - - line=$(ip netns exec ${ns} nft list table inet filter | grep 'fib saddr . iif' | grep $address | grep "packets $want" ) - ret=$? - - if [ $ret -ne 0 ];then - echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2 - ip netns exec ${ns} nft list table inet filter - return 1 - fi - - if [ $want -gt 0 ]; then - echo "PASS: fib expression did drop packets for $address" - fi - - return 0 -} - -load_ruleset ${nsrouter} -load_ruleset ${ns1} -load_ruleset ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -test_ping() { - local daddr4=$1 - local daddr6=$2 - - ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 - return 1 - fi - - ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null - ret=$? 
- if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 - return 1 - fi - - return 0 -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null - -sleep 3 - -test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 - -test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 - -echo "PASS: fib expression did not cause unwanted packet drops" - -ip netns exec ${nsrouter} nft flush table inet filter - -ip -net ${ns1} route del default -ip -net ${ns1} -6 route del default - -ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr del dead:1::99/64 dev eth0 - -ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr add dead:2::99/64 dev eth0 - -ip -net ${ns1} route add default via 10.0.2.1 -ip -net ${ns1} -6 route add default via dead:2::1 - -ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 - -# switch to ruleset that doesn't log, this time -# its expected that this does drop the packets. -load_ruleset_count ${nsrouter} - -# ns1 has a default route, but nsrouter does not. -# must not check return value, ping to 1.1.1.1 will -# fail. 
-check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 -check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 - -ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null -check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 - -sleep 2 -ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null -check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 - -# delete all rules -ip netns exec ${ns1} nft flush ruleset -ip netns exec ${ns2} nft flush ruleset -ip netns exec ${nsrouter} nft flush ruleset - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 - -ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr del dead:2::99/64 dev eth0 - -ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 - -# ... pbr ruleset for the router, check iif+oif. -load_pbr_ruleset ${nsrouter} -if [ $? -ne 0 ] ; then - echo "SKIP: Could not load fib forward ruleset" - exit $ksft_skip -fi - -ip -net ${nsrouter} rule add from all table 128 -ip -net ${nsrouter} rule add from all iif veth0 table 129 -ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 -ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 - -# drop main ipv4 table -ip -net ${nsrouter} -4 rule delete table main - -test_ping 10.0.2.99 dead:2::99 -if [ $? -ne 0 ] ; then - ip -net ${nsrouter} nft list ruleset - echo "FAIL: fib mismatch in pbr setup" - exit 1 -fi - -echo "PASS: fib expression forward check with policy based routing" -exit 0 diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh deleted file mode 100755 index e12729753351..000000000000 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ /dev/null @@ -1,449 +0,0 @@ -#!/bin/bash -# -# This tests nf_queue: -# 1. can process packets from all hooks -# 2. support running nfqueue from more than one base chain -# -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -cleanup() -{ - ip netns pids ${ns1} | xargs kill 2>/dev/null - ip netns pids ${ns2} | xargs kill 2>/dev/null - ip netns pids ${nsrouter} | xargs kill 2>/dev/null - - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - rm -f "$TMPFILE0" - rm -f "$TMPFILE1" - rm -f "$TMPFILE2" "$TMPFILE3" -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -TMPFILE0=$(mktemp) -TMPFILE1=$(mktemp) -TMPFILE2=$(mktemp) -TMPFILE3=$(mktemp) -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? 
-ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -load_ruleset() { - local name=$1 - local prio=$2 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table inet $name { - chain nfq { - ip protocol icmp queue bypass - icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass - } - chain pre { - type filter hook prerouting priority $prio; policy accept; - jump nfq - } - chain input { - type filter hook input priority $prio; policy accept; - jump nfq - } - chain forward { - type filter hook forward priority $prio; policy accept; - tcp dport 12345 queue num 2 - jump nfq - } - chain output { - type filter hook output priority $prio; policy accept; - tcp dport 12345 queue num 3 - tcp sport 23456 queue num 3 - jump nfq - } - chain post { - type filter hook postrouting priority $prio; policy accept; - jump nfq - } -} -EOF -} - -load_counter_ruleset() { - local prio=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table inet countrules { - chain pre { - type filter hook prerouting priority $prio; policy 
accept; - counter - } - chain input { - type filter hook input priority $prio; policy accept; - counter - } - chain forward { - type filter hook forward priority $prio; policy accept; - counter - } - chain output { - type filter hook output priority $prio; policy accept; - counter - } - chain post { - type filter hook postrouting priority $prio; policy accept; - counter - } -} -EOF -} - -test_ping() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_ping_router() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_queue_blackhole() { - local proto=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table $proto blackh { - chain forward { - type filter hook forward priority 0; policy accept; - queue num 600 - } -} -EOF - if [ $proto = "ip" ] ;then - ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null - lret=$? - elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null - lret=$? - else - lret=111 - fi - - # queue without bypass keyword should drop traffic if no listener exists. - if [ $lret -eq 0 ];then - echo "FAIL: $proto expected failure, got $lret" 1>&2 - exit 1 - fi - - ip netns exec ${nsrouter} nft delete table $proto blackh - if [ $? -ne 0 ] ;then - echo "FAIL: $proto: Could not delete blackh table" - exit 1 - fi - - echo "PASS: $proto: statement with no listener results in packet drop" -} - -test_queue() -{ - local expected=$1 - local last="" - - # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & - sleep 1 - test_ping - ret=$? 
- if [ $ret -ne 0 ];then - echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - test_ping_router - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - wait - ret=$? - - for file in $TMPFILE0 $TMPFILE1; do - last=$(tail -n1 "$file") - if [ x"$last" != x"$expected packets total" ]; then - echo "FAIL: Expected $expected packets total, but got $last" 1>&2 - cat "$file" 1>&2 - - ip netns exec ${nsrouter} nft list ruleset - exit 1 - fi - done - - echo "PASS: Expected and received $last" -} - -test_tcp_forward() -{ - ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & - local nfqpid=$! - - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - sleep 1 - ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & - - rm -f "$tmpfile" - - wait $rpid - wait $lpid - [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" -} - -test_tcp_localhost() -{ - tmpfile=$(mktemp) || exit 1 - - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback" - wait 2>/dev/null -} - -test_tcp_localhost_connectclose() -{ - tmpfile=$(mktemp) || exit 1 - - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - rm -f "$tmpfile" - - wait $rpid - [ $? 
-eq 0 ] && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null -} - -test_tcp_localhost_requeue() -{ -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -flush ruleset -table inet filter { - chain output { - type filter hook output priority 0; policy accept; - tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 - } - chain post { - type filter hook postrouting priority 0; policy accept; - tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 - } -} -EOF - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & - - # nfqueue 1 will be called via output hook. But this time, - # re-queue the packet to nfqueue program on queue 2. - ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait - - if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then - echo "FAIL: lost packets during requeue?!" 1>&2 - return - fi - - echo "PASS: tcp via loopback and re-queueing" -} - -test_icmp_vrf() { - ip -net $ns1 link add tvrf type vrf table 9876 - if [ $? 
-ne 0 ];then - echo "SKIP: Could not add vrf device" - return - fi - - ip -net $ns1 li set eth0 master tvrf - ip -net $ns1 li set tvrf up - - ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 -ip netns exec ${ns1} nft -f /dev/stdin <<EOF -flush ruleset -table inet filter { - chain output { - type filter hook output priority 0; policy accept; - meta oifname "tvrf" icmp type echo-request counter queue num 1 - meta oifname "eth0" icmp type echo-request counter queue num 1 - } - chain post { - type filter hook postrouting priority 0; policy accept; - meta oifname "tvrf" icmp type echo-request counter queue num 1 - meta oifname "eth0" icmp type echo-request counter queue num 1 - } -} -EOF - ip netns exec ${ns1} ./nf-queue -q 1 -t $timeout & - local nfqpid=$! - - sleep 1 - ip netns exec ${ns1} ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null - - for n in output post; do - for d in tvrf eth0; do - ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" - if [ $? -ne 0 ] ; then - echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 - ip netns exec ${ns1} nft list ruleset - ret=1 - return - fi - done - done - - wait $nfqpid - [ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - -load_ruleset "filter" 0 - -sleep 3 - -test_ping -ret=$? -if [ $ret -eq 0 ];then - # queue bypass works (rules were skipped, no listener) - echo "PASS: ${ns1} can reach ${ns2}" -else - echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 - exit $ret -fi - -test_queue_blackhole ip -test_queue_blackhole ip6 - -# dummy ruleset to add base chains between the -# queueing rules. We don't want the second reinject -# to re-execute the old hooks. 
-load_counter_ruleset 10 - -# we are hooking all: prerouting/input/forward/output/postrouting. -# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: -# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). -# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. -# so we expect that userspace program receives 10 packets. -test_queue 10 - -# same. We queue to a second program as well. -load_ruleset "filter2" 20 -test_queue 20 - -test_tcp_forward -test_tcp_localhost -test_tcp_localhost_connectclose -test_tcp_localhost_requeue -test_icmp_vrf - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_synproxy.sh b/tools/testing/selftests/netfilter/nft_synproxy.sh deleted file mode 100755 index b62933b680d6..000000000000 --- a/tools/testing/selftests/netfilter/nft_synproxy.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -rnd=$(mktemp -u XXXXXXXX) -nsr="nsr-$rnd" # synproxy machine -ns1="ns1-$rnd" # iperf client -ns2="ns2-$rnd" # iperf server - -checktool (){ - if ! 
$1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "iperf3 --version" "run test without iperf3" -checktool "ip netns add $nsr" "create net namespace" - -modprobe -q nf_conntrack - -ip netns add $ns1 -ip netns add $ns2 - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 - - ip netns del $nsr -} - -trap cleanup EXIT - -ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do -ip -net $nsr link set $dev up -done - -ip -net $nsr addr add 10.0.1.1/24 dev veth0 -ip -net $nsr addr add 10.0.2.1/24 dev veth1 - -ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 -ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 - -for n in $ns1 $ns2; do - ip -net $n link set lo up - ip -net $n link set eth0 up -done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 - -# test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then - echo "ERROR: $ns1 cannot reach $ns2" 1>&2 - exit 1 -fi - -if ! 
ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & -# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & - -sleep 1 - -ip netns exec $nsr nft -f - <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority -300; policy accept; - meta iif veth0 tcp flags syn counter notrack - } - - chain forward { - type filter hook forward priority 0; policy accept; - - ct state new,established counter accept - - meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp - - ct state invalid counter drop - - # make ns2 unreachable w.o. tcp synproxy - tcp flags syn counter drop - } -} -EOF -if [ $? -ne 0 ]; then - echo "SKIP: Cannot add nft synproxy" - exit $ksft_skip -fi - -ip netns exec $ns1 timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null - -if [ $? -ne 0 ]; then - echo "FAIL: iperf3 returned an error" 1>&2 - ret=$? - ip netns exec $nsr nft list ruleset -else - echo "PASS: synproxy connection successful" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh deleted file mode 100755 index 2ffba45a78bf..000000000000 --- a/tools/testing/selftests/netfilter/nft_trans_stress.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -# This test is for stress-testing the nf_tables config plane path vs. -# packet path processing: Make sure we never release rules that are -# still visible to other cpus. -# -# set -e - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 - -testns=testns-$(mktemp -u "XXXXXXXX") -tmp="" - -tables="foo bar baz quux" -global_ret=0 -eret=0 -lret=0 - -cleanup() { - ip netns pids "$testns" | xargs kill 2>/dev/null - ip netns del "$testns" - - rm -f "$tmp" -} - -check_result() -{ - local r=$1 - local OK="PASS" - - if [ $r -ne 0 ] ;then - OK="FAIL" - global_ret=$r - fi - - echo "$OK: nft $2 test returned $r" - - eret=0 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -trap cleanup EXIT -tmp=$(mktemp) - -for table in $tables; do - echo add table inet "$table" >> "$tmp" - echo flush table inet "$table" >> "$tmp" - - echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp" - echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp" - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - echo "add chain inet $table $chain" >> "$tmp" - done - - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - for BASE in INPUT OUTPUT; do - echo "add rule inet $table $BASE counter jump $chain" >> "$tmp" - done - echo "add rule inet $table $chain counter return" >> "$tmp" - done -done - -ip netns add "$testns" -ip -netns "$testns" link set lo up - -lscpu | grep ^CPU\(s\): | ( read cpu cpunum ; -cpunum=$((cpunum-1)) -for i in $(seq 0 $cpunum);do - mask=$(printf 0x%x $((1<<$i))) - ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null & - ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null & -done) - -sleep 1 - -ip netns exec "$testns" nft -f "$tmp" -for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done - -for table in $tables;do - randsleep=$((RANDOM%2)) - sleep $randsleep - ip netns exec "$testns" nft delete table inet $table - lret=$? 
- if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "reload" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp" - echo "insert rule inet foo INPUT meta nftrace set 1" - echo "insert rule inet foo OUTPUT meta nftrace set 1" - ) | ip netns exec "$testns" nft -f /dev/stdin - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi - - (echo "flush ruleset"; cat "$tmp" - ) | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete with nftrace enabled" - -echo "insert rule inet foo INPUT meta nftrace set 1" >> $tmp -echo "insert rule inet foo OUTPUT meta nftrace set 1" >> $tmp - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? 
- if [ $lret -ne 0 ]; then - eret=1 - fi -done - -check_result $lret "add/delete with nftrace enabled" - -exit $global_ret diff --git a/tools/testing/selftests/netfilter/settings b/tools/testing/selftests/netfilter/settings deleted file mode 100644 index 6091b45d226b..000000000000 --- a/tools/testing/selftests/netfilter/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=120 diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c index 505294da1b9f..d6f99eb9be65 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c +++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c @@ -154,7 +154,7 @@ static int dev_papr_vpd_null_handle(void) static int papr_vpd_close_handle_without_reading(void) { const int devfd = open(DEVPATH, O_RDONLY); - struct papr_location_code lc; + struct papr_location_code lc = { .str = "", }; int fd; SKIP_IF_MSG(devfd < 0 && errno == ENOENT, diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings index 6091b45d226b..a953c96aa16e 100644 --- a/tools/testing/selftests/seccomp/settings +++ b/tools/testing/selftests/seccomp/settings @@ -1 +1 @@ -timeout=120 +timeout=180 diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d49dd3ffd0d9..c001dd79179d 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end) diff = end.tv_usec - start.tv_usec; diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; - if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { printf("Diff too high: %lld..", diff); return -1; } @@ -184,80 +184,71 @@ static int check_timer_create(int which) return 0; } -int remain; -__thread int got_signal; +static pthread_t ctd_thread; +static volatile int ctd_count, ctd_failed; 
-static void *distribution_thread(void *arg) +static void ctd_sighandler(int sig) { - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); - return NULL; + if (pthread_self() != ctd_thread) + ctd_failed = 1; + ctd_count--; } -static void distribution_handler(int nr) +static void *ctd_thread_func(void *arg) { - if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) - __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); -} - -/* - * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID - * timer signals. This primarily tests that the kernel does not favour any one. - */ -static int check_timer_distribution(void) -{ - int err, i; - timer_t id; - const int nthreads = 10; - pthread_t threads[nthreads]; struct itimerspec val = { .it_value.tv_sec = 0, .it_value.tv_nsec = 1000 * 1000, .it_interval.tv_sec = 0, .it_interval.tv_nsec = 1000 * 1000, }; + timer_t id; - remain = nthreads + 1; /* worker threads + this thread */ - signal(SIGALRM, distribution_handler); - err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + /* 1/10 seconds to ensure the leader sleeps */ + usleep(10000); - for (i = 0; i < nthreads; i++) { - err = pthread_create(&threads[i], NULL, distribution_thread, - NULL); - if (err) { - ksft_print_msg("Can't create thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + ctd_count = 100; + if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) + return "Can't create timer\n"; + if (timer_settime(id, 0, &val, NULL)) + return "Can't set timer\n"; - /* Wait for all threads to receive the signal. 
*/ - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + while (ctd_count > 0 && !ctd_failed) + ; - for (i = 0; i < nthreads; i++) { - err = pthread_join(threads[i], NULL); - if (err) { - ksft_print_msg("Can't join thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + if (timer_delete(id)) + return "Can't delete timer\n"; - if (timer_delete(id)) { - ksft_perror("Can't delete timer"); - return -1; - } + return NULL; +} + +/* + * Test that only the running thread receives the timer signal. + */ +static int check_timer_distribution(void) +{ + const char *errmsg; - ksft_test_result_pass("check_timer_distribution\n"); + signal(SIGALRM, ctd_sighandler); + + errmsg = "Can't create thread\n"; + if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) + goto err; + + errmsg = "Can't join thread\n"; + if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) + goto err; + + if (!ctd_failed) + ksft_test_result_pass("check signal distribution\n"); + else if (ksft_min_kernel_version(6, 3)) + ksft_test_result_fail("check signal distribution\n"); + else + ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; +err: + ksft_print_msg("%s", errmsg); + return -1; } int main(int argc, char **argv) diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 48b9a803235a..d13ebde20322 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -21,9 +21,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ - - - #include <stdio.h> #include <stdlib.h> #include <time.h> @@ -62,45 +59,47 @@ int clear_time_state(void) #define NUM_FREQ_OUTOFRANGE 4 #define NUM_FREQ_INVALID 2 +#define SHIFTED_PPM (1 << 16) + long valid_freq[NUM_FREQ_VALID] = { - -499<<16, - -450<<16, - -400<<16, - -350<<16, - -300<<16, - -250<<16, - -200<<16, - -150<<16, - -100<<16, - -75<<16, - -50<<16, - -25<<16, - -10<<16, - -5<<16, - -1<<16, + -499 * SHIFTED_PPM, + -450 * SHIFTED_PPM, + -400 * SHIFTED_PPM, + -350 * SHIFTED_PPM, + -300 * SHIFTED_PPM, + -250 * SHIFTED_PPM, + -200 * SHIFTED_PPM, + -150 * SHIFTED_PPM, + -100 * SHIFTED_PPM, + -75 * SHIFTED_PPM, + -50 * SHIFTED_PPM, + -25 * SHIFTED_PPM, + -10 * SHIFTED_PPM, + -5 * SHIFTED_PPM, + -1 * SHIFTED_PPM, -1000, - 1<<16, - 5<<16, - 10<<16, - 25<<16, - 50<<16, - 75<<16, - 100<<16, - 150<<16, - 200<<16, - 250<<16, - 300<<16, - 350<<16, - 400<<16, - 450<<16, - 499<<16, + 1 * SHIFTED_PPM, + 5 * SHIFTED_PPM, + 10 * SHIFTED_PPM, + 25 * SHIFTED_PPM, + 50 * SHIFTED_PPM, + 75 * SHIFTED_PPM, + 100 * SHIFTED_PPM, + 150 * SHIFTED_PPM, + 200 * SHIFTED_PPM, + 250 * SHIFTED_PPM, + 300 * SHIFTED_PPM, + 350 * SHIFTED_PPM, + 400 * SHIFTED_PPM, + 450 * SHIFTED_PPM, + 499 * SHIFTED_PPM, }; long outofrange_freq[NUM_FREQ_OUTOFRANGE] = { - -1000<<16, - -550<<16, - 550<<16, - 1000<<16, + -1000 * SHIFTED_PPM, + -550 * SHIFTED_PPM, + 550 * SHIFTED_PPM, + 1000 * SHIFTED_PPM, }; #define LONG_MAX (~0UL>>1) diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py new file mode 100755 index 000000000000..d9b042097da7 --- /dev/null +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -0,0 +1,60 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + 
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +# +# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns +# which are only visible when running with --debug. +# +expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip() +expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'') + +# +# Run turbostat with no options for 10 seconds and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s'] +turbostat_argv = [turbostat, '-i', '0.250'] + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) +print('OK') + +# +# Same, but with --debug +# +turbostat_argv.append('--debug') + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) +print('OK') |