From 54e20be48fd4bc1df5f6fbca552b5be8c47dbd18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:40 +0100 Subject: xor: split xor.h Keep xor.h for the public API, and split the struct xor_block_template definition that is only needed by the xor.ko core and architecture-specific optimizations into a separate xor_impl.h header. Link: https://lkml.kernel.org/r/20260327061704.3707577-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 22 +--------------------- include/linux/raid/xor_impl.h | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 21 deletions(-) create mode 100644 include/linux/raid/xor_impl.h (limited to 'include/linux/raid') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 51b811b62322..02bda8d99534 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -7,24 +7,4 @@ extern void xor_blocks(unsigned int count, unsigned int bytes, void *dest, void **srcs); -struct xor_block_template { - struct xor_block_template *next; - const char *name; - int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); -}; - -#endif +#endif /* _XOR_H */ diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h new file mode 100644 index 000000000000..a1890cd66812 --- /dev/null +++ b/include/linux/raid/xor_impl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XOR_IMPL_H +#define _XOR_IMPL_H + +struct xor_block_template { + struct xor_block_template *next; + const char *name; + int speed; + void (*do_2)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_3)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_4)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_5)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); +}; + +#endif /* _XOR_IMPL_H */ -- cgit v1.2.3 From 35ebc4de105989034f1250e40eb6dbf5e136b04e Mon Sep 17 
00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:41 +0100 Subject: xor: remove macro abuse for XOR implementation registrations Drop the pretty confusing historic XOR_TRY_TEMPLATES and XOR_SELECT_TEMPLATE, and instead let the architectures provide an arch_xor_init that calls either xor_register to register candidates or xor_force to force a specific implementation. Link: https://lkml.kernel.org/r/20260327061704.3707577-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/include/asm/xor.h | 29 ++++++++++++++------------- arch/arm/include/asm/xor.h | 25 ++++++++++++------------ arch/arm64/include/asm/xor.h | 18 ++++++++--------- arch/loongarch/include/asm/xor.h | 42 +++++++++++++++------------------------- arch/powerpc/include/asm/xor.h | 31 +++++++++++++---------------- arch/riscv/include/asm/xor.h | 19 +++++++++--------- arch/s390/include/asm/xor.h | 12 +++++------- arch/sparc/include/asm/xor_32.h | 14 +++++++------- arch/sparc/include/asm/xor_64.h | 31 ++++++++++++++--------------- arch/x86/include/asm/xor.h | 3 --- arch/x86/include/asm/xor_32.h | 36 ++++++++++++++++++---------------- arch/x86/include/asm/xor_64.h | 18 ++++++++++------- arch/x86/include/asm/xor_avx.h | 9 --------- include/asm-generic/xor.h | 8 -------- include/linux/raid/xor_impl.h | 5 +++++ lib/raid/xor/xor-core.c | 41 +++++++++++++++++++++++++++++---------- 16 files changed, 168 insertions(+), 173 deletions(-) (limited to 'include/linux/raid') diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h index e0de0c233ab9..4c8085711df1 100644 --- a/arch/alpha/include/asm/xor.h +++ b/arch/alpha/include/asm/xor.h @@ -851,16 +851,19 @@ static struct xor_block_template xor_block_alpha_prefetch = { /* For grins, also test the generic routines. */ #include -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_alpha); \ - xor_speed(&xor_block_alpha_prefetch); \ - } while (0) - -/* Force the use of alpha_prefetch if EV6, as it is significantly - faster in the cold cache case. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - (implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST) +/* + * Force the use of alpha_prefetch if EV6, as it is significantly faster in the + * cold cache case. 
+ */ +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + if (implver() == IMPLVER_EV6) { + xor_force(&xor_block_alpha_prefetch); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_alpha); + xor_register(&xor_block_alpha_prefetch); + } +} diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h index bca2a6514746..b2dcd49186e2 100644 --- a/arch/arm/include/asm/xor.h +++ b/arch/arm/include/asm/xor.h @@ -138,15 +138,6 @@ static struct xor_block_template xor_block_arm4regs = { .do_5 = xor_arm4regs_5, }; -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_arm4regs); \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - NEON_TEMPLATES; \ - } while (0) - #ifdef CONFIG_KERNEL_MODE_NEON extern struct xor_block_template const xor_block_neon_inner; @@ -201,8 +192,16 @@ static struct xor_block_template xor_block_neon = { .do_5 = xor_neon_5 }; -#define NEON_TEMPLATES \ - do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0) -#else -#define NEON_TEMPLATES +#endif /* CONFIG_KERNEL_MODE_NEON */ + +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_arm4regs); + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_KERNEL_MODE_NEON + if (cpu_has_neon()) + xor_register(&xor_block_neon); #endif +} diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h index bb7428d4ebc6..3cee1eb86371 100644 --- a/arch/arm64/include/asm/xor.h +++ b/arch/arm64/include/asm/xor.h @@ -60,14 +60,14 @@ static struct xor_block_template xor_block_arm64 = { .do_4 = xor_neon_4, .do_5 = xor_neon_5 }; -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - if (cpu_has_neon()) { \ - xor_speed(&xor_block_arm64);\ - } \ - } while (0) + +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + if (cpu_has_neon()) + xor_register(&xor_block_arm64); +} #endif /* ! 
CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h index 12467fffee46..d17c0e3b047f 100644 --- a/arch/loongarch/include/asm/xor.h +++ b/arch/loongarch/include/asm/xor.h @@ -16,14 +16,6 @@ static struct xor_block_template xor_block_lsx = { .do_4 = xor_lsx_4, .do_5 = xor_lsx_5, }; - -#define XOR_SPEED_LSX() \ - do { \ - if (cpu_has_lsx) \ - xor_speed(&xor_block_lsx); \ - } while (0) -#else /* CONFIG_CPU_HAS_LSX */ -#define XOR_SPEED_LSX() #endif /* CONFIG_CPU_HAS_LSX */ #ifdef CONFIG_CPU_HAS_LASX @@ -34,14 +26,6 @@ static struct xor_block_template xor_block_lasx = { .do_4 = xor_lasx_4, .do_5 = xor_lasx_5, }; - -#define XOR_SPEED_LASX() \ - do { \ - if (cpu_has_lasx) \ - xor_speed(&xor_block_lasx); \ - } while (0) -#else /* CONFIG_CPU_HAS_LASX */ -#define XOR_SPEED_LASX() #endif /* CONFIG_CPU_HAS_LASX */ /* @@ -54,15 +38,21 @@ static struct xor_block_template xor_block_lasx = { */ #include -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - XOR_SPEED_LSX(); \ - XOR_SPEED_LASX(); \ -} while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_CPU_HAS_LSX + if (cpu_has_lsx) + xor_register(&xor_block_lsx); +#endif +#ifdef CONFIG_CPU_HAS_LASX + if (cpu_has_lasx) + xor_register(&xor_block_lasx); +#endif +} #endif /* _ASM_LOONGARCH_XOR_H */ diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h index 37d05c11d09c..30224c5279c4 100644 --- a/arch/powerpc/include/asm/xor.h +++ b/arch/powerpc/include/asm/xor.h @@ -21,27 +21,22 @@ static struct xor_block_template xor_block_altivec = { .do_4 = xor_altivec_4, .do_5 = xor_altivec_5, }; - -#define XOR_SPEED_ALTIVEC() \ - do { \ - if (cpu_has_feature(CPU_FTR_ALTIVEC)) \ - xor_speed(&xor_block_altivec); \ - } while (0) -#else -#define XOR_SPEED_ALTIVEC() -#endif +#endif /* CONFIG_ALTIVEC */ /* Also try the generic routines. 
*/ #include -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - XOR_SPEED_ALTIVEC(); \ -} while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + xor_register(&xor_block_altivec); +#endif +} #endif /* _ASM_POWERPC_XOR_H */ diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h index 96011861e46b..ed5f27903efc 100644 --- a/arch/riscv/include/asm/xor.h +++ b/arch/riscv/include/asm/xor.h @@ -55,14 +55,15 @@ static struct xor_block_template xor_block_rvv = { .do_4 = xor_vector_4, .do_5 = xor_vector_5 }; +#endif /* CONFIG_RISCV_ISA_V */ -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - if (has_vector()) { \ - xor_speed(&xor_block_rvv);\ - } \ - } while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_RISCV_ISA_V + if (has_vector()) + xor_register(&xor_block_rvv); #endif +} diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h index 857d6759b67f..4e2233f64da9 100644 --- a/arch/s390/include/asm/xor.h +++ b/arch/s390/include/asm/xor.h @@ -10,12 +10,10 @@ extern struct xor_block_template xor_block_xc; -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_xc); \ -} while (0) - -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_xc) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_force(&xor_block_xc); +} #endif /* _ASM_S390_XOR_H */ diff --git a/arch/sparc/include/asm/xor_32.h b/arch/sparc/include/asm/xor_32.h index 0351813cf3af..8fbf0c07ec28 100644 --- a/arch/sparc/include/asm/xor_32.h +++ b/arch/sparc/include/asm/xor_32.h @@ -259,10 +259,10 @@ static struct xor_block_template xor_block_SPARC = { /* For grins, also test the generic routines. */ #include -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_SPARC); \ - } while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_SPARC); +} diff --git a/arch/sparc/include/asm/xor_64.h b/arch/sparc/include/asm/xor_64.h index caaddea8ad79..e0482ecc0a68 100644 --- a/arch/sparc/include/asm/xor_64.h +++ b/arch/sparc/include/asm/xor_64.h @@ -60,20 +60,17 @@ static struct xor_block_template xor_block_niagara = { .do_5 = xor_niagara_5, }; -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_VIS); \ - xor_speed(&xor_block_niagara); \ - } while (0) - -/* For VIS for everything except Niagara. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - ((tlb_type == hypervisor && \ - (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) ? 
\ - &xor_block_niagara : \ - &xor_block_VIS) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + /* Force VIS for everything except Niagara. */ + if (tlb_type == hypervisor && + (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) + xor_force(&xor_block_niagara); + else + xor_force(&xor_block_VIS); +} diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 7b0307acc410..33f5620d8d69 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -496,7 +496,4 @@ static struct xor_block_template xor_block_sse_pf64 = { # include #endif -#define XOR_SELECT_TEMPLATE(FASTEST) \ - AVX_SELECT(FASTEST) - #endif /* _ASM_X86_XOR_H */ diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 7a6b9474591e..ee32d08c27bc 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -552,22 +552,24 @@ static struct xor_block_template xor_block_pIII_sse = { /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - AVX_XOR_SPEED; \ - if (boot_cpu_has(X86_FEATURE_XMM)) { \ - xor_speed(&xor_block_pIII_sse); \ - xor_speed(&xor_block_sse_pf64); \ - } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ - xor_speed(&xor_block_pII_mmx); \ - xor_speed(&xor_block_p5_mmx); \ - } else { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - } \ -} while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_OSXSAVE)) { + xor_force(&xor_block_avx); + } else if (boot_cpu_has(X86_FEATURE_XMM)) { + xor_register(&xor_block_pIII_sse); + xor_register(&xor_block_sse_pf64); + } else if (boot_cpu_has(X86_FEATURE_MMX)) { + xor_register(&xor_block_pII_mmx); + xor_register(&xor_block_p5_mmx); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); + } +} #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 0307e4ec5044..2d2ceb241866 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -17,12 +17,16 @@ static struct xor_block_template xor_block_sse = { /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. 
*/ -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - AVX_XOR_SPEED; \ - xor_speed(&xor_block_sse_pf64); \ - xor_speed(&xor_block_sse); \ -} while (0) +#define arch_xor_init arch_xor_init +static __always_inline void __init arch_xor_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_OSXSAVE)) { + xor_force(&xor_block_avx); + } else { + xor_register(&xor_block_sse_pf64); + xor_register(&xor_block_sse); + } +} #endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7f81dd5897f4..c600888436bb 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -166,13 +166,4 @@ static struct xor_block_template xor_block_avx = { .do_5 = xor_avx_5, }; -#define AVX_XOR_SPEED \ -do { \ - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ - xor_speed(&xor_block_avx); \ -} while (0) - -#define AVX_SELECT(FASTEST) \ - (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) - #endif diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h index 44509d48fca2..79c0096aa9d9 100644 --- a/include/asm-generic/xor.h +++ b/include/asm-generic/xor.h @@ -728,11 +728,3 @@ static struct xor_block_template xor_block_32regs_p __maybe_unused = { .do_4 = xor_32regs_p_4, .do_5 = xor_32regs_p_5, }; - -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - } while (0) diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h index a1890cd66812..6ed4c445ab24 100644 --- a/include/linux/raid/xor_impl.h +++ b/include/linux/raid/xor_impl.h @@ -2,6 +2,8 @@ #ifndef _XOR_IMPL_H #define _XOR_IMPL_H +#include + struct xor_block_template { struct xor_block_template *next; const char *name; @@ -22,4 +24,7 @@ struct xor_block_template { const unsigned long * __restrict); }; +void __init xor_register(struct xor_block_template *tmpl); +void __init xor_force(struct xor_block_template *tmpl); + #endif /* _XOR_IMPL_H */ diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c index db1824011a12..93608b5fece9 100644 --- a/lib/raid/xor/xor-core.c +++ b/lib/raid/xor/xor-core.c @@ -14,10 +14,6 @@ #include #include -#ifndef XOR_SELECT_TEMPLATE -#define XOR_SELECT_TEMPLATE(x) (x) -#endif - /* The xor routines to use. */ static struct xor_block_template *active_template; @@ -55,12 +51,33 @@ EXPORT_SYMBOL(xor_blocks); static struct xor_block_template *__initdata template_list; static bool __initdata xor_forced = false; -static void __init do_xor_register(struct xor_block_template *tmpl) +/** + * xor_register - register a XOR template + * @tmpl: template to register + * + * Register a XOR implementation with the core. Registered implementations + * will be measured by a trivial benchmark, and the fastest one is chosen + * unless an implementation is forced using xor_force(). + */ +void __init xor_register(struct xor_block_template *tmpl) { tmpl->next = template_list; template_list = tmpl; } +/** + * xor_force - force use of a XOR template + * @tmpl: template to register + * + * Register a XOR implementation with the core and force using it. Forcing + * an implementation will make the core ignore any template registered using + * xor_register(), or any previous implementation forced using xor_force(). 
+ */ +void __init xor_force(struct xor_block_template *tmpl) +{ + active_template = tmpl; +} + #define BENCH_SIZE 4096 #define REPS 800U @@ -126,11 +143,19 @@ static int __init calibrate_xor_blocks(void) static int __init xor_init(void) { +#ifdef arch_xor_init + arch_xor_init(); +#else + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#endif + /* * If this arch/cpu has a short-circuited selection, don't loop through * all the possible functions, just use the best one. */ - active_template = XOR_SELECT_TEMPLATE(NULL); if (active_template) { pr_info("xor: automatically using best checksumming function %-10s\n", active_template->name); @@ -138,10 +163,6 @@ static int __init xor_init(void) return 0; } -#define xor_speed do_xor_register - XOR_TRY_TEMPLATES; -#undef xor_speed - #ifdef MODULE return calibrate_xor_blocks(); #else -- cgit v1.2.3 From e20043b4765cdf7ec8e963d706bb91469cba8cb8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:53 +0100 Subject: xor: make xor.ko self-contained in lib/raid/ Move the asm/xor.h headers to lib/raid/xor/$(SRCARCH)/xor_arch.h and include/linux/raid/xor_impl.h to lib/raid/xor/xor_impl.h so that the xor.ko module implementation is self-contained in lib/raid/. As this removes the asm-generic mechanism, a new Kconfig symbol is added to indicate that an architecture-specific implementation exists and that xor_arch.h should be included. Link: https://lkml.kernel.org/r/20260327061704.3707577-22-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. 
Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/include/asm/xor.h | 24 ------------------- arch/arm/include/asm/xor.h | 21 ---------------- arch/arm64/include/asm/xor.h | 24 ------------------- arch/loongarch/include/asm/xor.h | 40 ------------------------------- arch/powerpc/include/asm/xor.h | 29 ---------------------- arch/riscv/include/asm/xor.h | 19 --------------- arch/s390/include/asm/xor.h | 19 --------------- arch/sparc/include/asm/xor.h | 44 ---------------------------------- arch/um/include/asm/xor.h | 8 ------- arch/x86/include/asm/xor.h | 43 --------------------------------- include/asm-generic/Kbuild | 1 - include/asm-generic/xor.h | 11 --------- include/linux/raid/xor_impl.h | 30 ----------------------- lib/raid/Kconfig | 15 ++++++++++++ lib/raid/xor/Makefile | 6 +++++ lib/raid/xor/alpha/xor.c | 4 ++-- lib/raid/xor/alpha/xor_arch.h | 22 +++++++++++++++++ lib/raid/xor/arm/xor-neon-glue.c | 4 ++-- lib/raid/xor/arm/xor-neon.c | 2 +- lib/raid/xor/arm/xor.c | 4 ++-- lib/raid/xor/arm/xor_arch.h | 19 +++++++++++++++ lib/raid/xor/arm64/xor-neon-glue.c | 4 ++-- lib/raid/xor/arm64/xor-neon.c | 4 ++-- lib/raid/xor/arm64/xor_arch.h | 21 ++++++++++++++++ lib/raid/xor/loongarch/xor_arch.h | 33 +++++++++++++++++++++++++ lib/raid/xor/loongarch/xor_simd_glue.c | 4 ++-- lib/raid/xor/powerpc/xor_arch.h | 22 +++++++++++++++++ lib/raid/xor/powerpc/xor_vmx_glue.c | 4 ++-- lib/raid/xor/riscv/xor-glue.c | 4 ++-- lib/raid/xor/riscv/xor_arch.h | 17 +++++++++++++ lib/raid/xor/s390/xor.c | 4 ++-- lib/raid/xor/s390/xor_arch.h | 13 ++++++++++ lib/raid/xor/sparc/xor-sparc32.c | 4 ++-- lib/raid/xor/sparc/xor-sparc64-glue.c | 4 ++-- lib/raid/xor/sparc/xor_arch.h | 35 +++++++++++++++++++++++++++ lib/raid/xor/um/xor_arch.h | 2 ++ lib/raid/xor/x86/xor-avx.c | 4 ++-- lib/raid/xor/x86/xor-mmx.c | 4 ++-- lib/raid/xor/x86/xor-sse.c | 4 ++-- lib/raid/xor/x86/xor_arch.h | 36 ++++++++++++++++++++++++++++ lib/raid/xor/xor-32regs-prefetch.c | 3 +-- lib/raid/xor/xor-32regs.c | 3 +-- lib/raid/xor/xor-8regs-prefetch.c | 3 +-- lib/raid/xor/xor-8regs.c | 3 +-- lib/raid/xor/xor-core.c | 18 ++++++++------ lib/raid/xor/xor_impl.h | 36 ++++++++++++++++++++++++++++ 46 files changed, 321 insertions(+), 357 deletions(-) delete mode 100644 arch/alpha/include/asm/xor.h delete mode 100644 arch/arm/include/asm/xor.h delete mode 100644 arch/arm64/include/asm/xor.h delete mode 100644 arch/loongarch/include/asm/xor.h delete mode 100644 arch/powerpc/include/asm/xor.h delete mode 100644 arch/riscv/include/asm/xor.h delete mode 100644 arch/s390/include/asm/xor.h delete mode 100644 arch/sparc/include/asm/xor.h delete mode 100644 arch/um/include/asm/xor.h delete mode 100644 arch/x86/include/asm/xor.h delete mode 100644 include/asm-generic/xor.h delete mode 100644 include/linux/raid/xor_impl.h create mode 100644 lib/raid/xor/alpha/xor_arch.h create mode 100644 lib/raid/xor/arm/xor_arch.h create mode 100644 lib/raid/xor/arm64/xor_arch.h create mode 100644 lib/raid/xor/loongarch/xor_arch.h create mode 100644 lib/raid/xor/powerpc/xor_arch.h create mode 100644 lib/raid/xor/riscv/xor_arch.h create mode 100644 lib/raid/xor/s390/xor_arch.h create mode 100644 lib/raid/xor/sparc/xor_arch.h create mode 100644 lib/raid/xor/um/xor_arch.h create mode 
100644 lib/raid/xor/x86/xor_arch.h create mode 100644 lib/raid/xor/xor_impl.h (limited to 'include/linux/raid') diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h deleted file mode 100644 index e517be577a09..000000000000 --- a/arch/alpha/include/asm/xor.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -#include -#include - -extern struct xor_block_template xor_block_alpha; -extern struct xor_block_template xor_block_alpha_prefetch; - -/* - * Force the use of alpha_prefetch if EV6, as it is significantly faster in the - * cold cache case. - */ -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - if (implver() == IMPLVER_EV6) { - xor_force(&xor_block_alpha_prefetch); - } else { - xor_register(&xor_block_8regs); - xor_register(&xor_block_32regs); - xor_register(&xor_block_alpha); - xor_register(&xor_block_alpha_prefetch); - } -} diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h deleted file mode 100644 index 989c55872ef6..000000000000 --- a/arch/arm/include/asm/xor.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2001 Russell King - */ -#include -#include - -extern struct xor_block_template xor_block_arm4regs; -extern struct xor_block_template xor_block_neon; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_arm4regs); - xor_register(&xor_block_8regs); - xor_register(&xor_block_32regs); -#ifdef CONFIG_KERNEL_MODE_NEON - if (cpu_has_neon()) - xor_register(&xor_block_neon); -#endif -} diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h deleted file mode 100644 index 4782c760bcac..000000000000 --- a/arch/arm64/include/asm/xor.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Authors: Jackie Liu - * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. - */ - -#include -#include - -extern struct xor_block_template xor_block_neon; -extern struct xor_block_template xor_block_eor3; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_8regs); - xor_register(&xor_block_32regs); - if (cpu_has_neon()) { - if (cpu_have_named_feature(SHA3)) - xor_register(&xor_block_eor3); - else - xor_register(&xor_block_neon); - } -} diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h deleted file mode 100644 index 7e32f72f8b03..000000000000 --- a/arch/loongarch/include/asm/xor.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2023 WANG Xuerui - */ -#ifndef _ASM_LOONGARCH_XOR_H -#define _ASM_LOONGARCH_XOR_H - -#include - -/* - * For grins, also test the generic routines. - * - * More importantly: it cannot be ruled out at this point of time, that some - * future (maybe reduced) models could run the vector algorithms slower than - * the scalar ones, maybe for errata or micro-op reasons. It may be - * appropriate to revisit this after one or two more uarch generations. 
- */ -#include - -extern struct xor_block_template xor_block_lsx; -extern struct xor_block_template xor_block_lasx; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_8regs); - xor_register(&xor_block_8regs_p); - xor_register(&xor_block_32regs); - xor_register(&xor_block_32regs_p); -#ifdef CONFIG_CPU_HAS_LSX - if (cpu_has_lsx) - xor_register(&xor_block_lsx); -#endif -#ifdef CONFIG_CPU_HAS_LASX - if (cpu_has_lasx) - xor_register(&xor_block_lasx); -#endif -} - -#endif /* _ASM_LOONGARCH_XOR_H */ diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h deleted file mode 100644 index 3293ac87181c..000000000000 --- a/arch/powerpc/include/asm/xor.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * - * Copyright (C) IBM Corporation, 2012 - * - * Author: Anton Blanchard - */ -#ifndef _ASM_POWERPC_XOR_H -#define _ASM_POWERPC_XOR_H - -#include -#include - -extern struct xor_block_template xor_block_altivec; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_8regs); - xor_register(&xor_block_8regs_p); - xor_register(&xor_block_32regs); - xor_register(&xor_block_32regs_p); -#ifdef CONFIG_ALTIVEC - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - xor_register(&xor_block_altivec); -#endif -} - -#endif /* _ASM_POWERPC_XOR_H */ diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h deleted file mode 100644 index 614d9209d078..000000000000 --- a/arch/riscv/include/asm/xor.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2021 SiFive - */ -#include -#include - -extern struct xor_block_template xor_block_rvv; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_8regs); - xor_register(&xor_block_32regs); -#ifdef CONFIG_RISCV_ISA_V - if (has_vector()) - xor_register(&xor_block_rvv); -#endif -} diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h deleted file mode 100644 index 4e2233f64da9..000000000000 --- a/arch/s390/include/asm/xor.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Optimited xor routines - * - * Copyright IBM Corp. 2016 - * Author(s): Martin Schwidefsky - */ -#ifndef _ASM_S390_XOR_H -#define _ASM_S390_XOR_H - -extern struct xor_block_template xor_block_xc; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_force(&xor_block_xc); -} - -#endif /* _ASM_S390_XOR_H */ diff --git a/arch/sparc/include/asm/xor.h b/arch/sparc/include/asm/xor.h deleted file mode 100644 index f923b009fc24..000000000000 --- a/arch/sparc/include/asm/xor.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - * Copyright (C) 2006 David S. Miller - */ -#ifndef ___ASM_SPARC_XOR_H -#define ___ASM_SPARC_XOR_H - -#if defined(__sparc__) && defined(__arch64__) -#include - -extern struct xor_block_template xor_block_VIS; -extern struct xor_block_template xor_block_niagara; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - /* Force VIS for everything except Niagara. 
*/ - if (tlb_type == hypervisor && - (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || - sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || - sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || - sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || - sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) - xor_force(&xor_block_niagara); - else - xor_force(&xor_block_VIS); -} -#else /* sparc64 */ - -/* For grins, also test the generic routines. */ -#include - -extern struct xor_block_template xor_block_SPARC; - -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - xor_register(&xor_block_8regs); - xor_register(&xor_block_32regs); - xor_register(&xor_block_SPARC); -} -#endif /* !sparc64 */ -#endif /* ___ASM_SPARC_XOR_H */ diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h deleted file mode 100644 index 99e5c7e1f475..000000000000 --- a/arch/um/include/asm/xor.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_UM_XOR_H -#define _ASM_UM_XOR_H - -#include -#include <../../x86/include/asm/xor.h> - -#endif diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h deleted file mode 100644 index d1aab8275908..000000000000 --- a/arch/x86/include/asm/xor.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_X86_XOR_H -#define _ASM_X86_XOR_H - -#include -#include - -extern struct xor_block_template xor_block_pII_mmx; -extern struct xor_block_template xor_block_p5_mmx; -extern struct xor_block_template xor_block_sse; -extern struct xor_block_template xor_block_sse_pf64; -extern struct xor_block_template xor_block_avx; - -/* - * When SSE is available, use it as it can write around L2. We may also be able - * to load into the L1 only depending on how the cpu deals with a load to a line - * that is being prefetched. - * - * When AVX2 is available, force using it as it is better by all measures. - * - * 32-bit without MMX can fall back to the generic routines. - */ -#define arch_xor_init arch_xor_init -static __always_inline void __init arch_xor_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_OSXSAVE)) { - xor_force(&xor_block_avx); - } else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) { - xor_register(&xor_block_sse); - xor_register(&xor_block_sse_pf64); - } else if (boot_cpu_has(X86_FEATURE_MMX)) { - xor_register(&xor_block_pII_mmx); - xor_register(&xor_block_p5_mmx); - } else { - xor_register(&xor_block_8regs); - xor_register(&xor_block_8regs_p); - xor_register(&xor_block_32regs); - xor_register(&xor_block_32regs_p); - } -} - -#endif /* _ASM_X86_XOR_H */ diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 9aff61e7b8f2..2c53a1e0b760 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -65,4 +65,3 @@ mandatory-y += vermagic.h mandatory-y += vga.h mandatory-y += video.h mandatory-y += word-at-a-time.h -mandatory-y += xor.h diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h deleted file mode 100644 index fc151fdc45ab..000000000000 --- a/include/asm-generic/xor.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * include/asm-generic/xor.h - * - * Generic optimized RAID-5 checksumming functions. 
- */ - -extern struct xor_block_template xor_block_8regs; -extern struct xor_block_template xor_block_32regs; -extern struct xor_block_template xor_block_8regs_p; -extern struct xor_block_template xor_block_32regs_p; diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h deleted file mode 100644 index 6ed4c445ab24..000000000000 --- a/include/linux/raid/xor_impl.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XOR_IMPL_H -#define _XOR_IMPL_H - -#include - -struct xor_block_template { - struct xor_block_template *next; - const char *name; - int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); -}; - -void __init xor_register(struct xor_block_template *tmpl); -void __init xor_force(struct xor_block_template *tmpl); - -#endif /* _XOR_IMPL_H */ diff --git a/lib/raid/Kconfig b/lib/raid/Kconfig index 01b73a1c303f..81cb3f9c0a7b 100644 --- a/lib/raid/Kconfig +++ b/lib/raid/Kconfig @@ -2,3 +2,18 @@ config XOR_BLOCKS tristate + +# selected by architectures that provide an optimized XOR implementation +config XOR_BLOCKS_ARCH + depends on XOR_BLOCKS + default y if ALPHA + default y if ARM + default y if ARM64 + default y if CPU_HAS_LSX # loongarch + default y if ALTIVEC # powerpc + default y if RISCV_ISA_V + default y if SPARC + default y if S390 + default y if X86_32 + default y if X86_64 + bool diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile index 05aca96041b3..df55823c4d82 100644 --- a/lib/raid/xor/Makefile +++ b/lib/raid/xor/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I $(src) + obj-$(CONFIG_XOR_BLOCKS) += xor.o xor-y += xor-core.o @@ -8,6 +10,10 @@ xor-y += xor-32regs.o xor-y += xor-8regs-prefetch.o xor-y += xor-32regs-prefetch.o +ifeq ($(CONFIG_XOR_BLOCKS_ARCH),y) +CFLAGS_xor-core.o += -I$(src)/$(SRCARCH) +endif + xor-$(CONFIG_ALPHA) += alpha/xor.o xor-$(CONFIG_ARM) += arm/xor.o ifeq ($(CONFIG_ARM),y) diff --git a/lib/raid/xor/alpha/xor.c b/lib/raid/xor/alpha/xor.c index 0964ac420604..90694cc47395 100644 --- a/lib/raid/xor/alpha/xor.c +++ b/lib/raid/xor/alpha/xor.c @@ -2,8 +2,8 @@ /* * Optimized XOR parity functions for alpha EV5 and EV6 */ -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" extern void xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/alpha/xor_arch.h b/lib/raid/xor/alpha/xor_arch.h new file mode 100644 index 000000000000..0dcfea578a48 --- /dev/null +++ b/lib/raid/xor/alpha/xor_arch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +extern struct xor_block_template xor_block_alpha; +extern struct xor_block_template xor_block_alpha_prefetch; + +/* + * Force the use of alpha_prefetch if EV6, as it is significantly faster in the + * cold cache case. 
+ */ +static __always_inline void __init arch_xor_init(void) +{ + if (implver() == IMPLVER_EV6) { + xor_force(&xor_block_alpha_prefetch); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_alpha); + xor_register(&xor_block_alpha_prefetch); + } +} diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c index c7b162b383a2..7afd6294464b 100644 --- a/lib/raid/xor/arm/xor-neon-glue.c +++ b/lib/raid/xor/arm/xor-neon-glue.c @@ -2,8 +2,8 @@ /* * Copyright (C) 2001 Russell King */ -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" extern struct xor_block_template const xor_block_neon_inner; diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c index c9d4378b0f0e..806a42c5952c 100644 --- a/lib/raid/xor/arm/xor-neon.c +++ b/lib/raid/xor/arm/xor-neon.c @@ -3,7 +3,7 @@ * Copyright (C) 2013 Linaro Ltd */ -#include +#include "xor_impl.h" #ifndef __ARM_NEON__ #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' diff --git a/lib/raid/xor/arm/xor.c b/lib/raid/xor/arm/xor.c index 2263341dbbcd..5bd5f048bbe9 100644 --- a/lib/raid/xor/arm/xor.c +++ b/lib/raid/xor/arm/xor.c @@ -2,8 +2,8 @@ /* * Copyright (C) 2001 Russell King */ -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" #define __XOR(a1, a2) a1 ^= a2 diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h new file mode 100644 index 000000000000..5a7eedb48fbb --- /dev/null +++ b/lib/raid/xor/arm/xor_arch.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2001 Russell King + */ +#include + +extern struct xor_block_template xor_block_arm4regs; +extern struct xor_block_template xor_block_neon; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_arm4regs); + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_KERNEL_MODE_NEON + if (cpu_has_neon()) + xor_register(&xor_block_neon); +#endif +} diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c index 08c3e3573388..3db0a318cf5b 100644 --- a/lib/raid/xor/arm64/xor-neon-glue.c +++ b/lib/raid/xor/arm64/xor-neon-glue.c @@ -4,9 +4,9 @@ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. */ -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #include "xor-neon.h" #define XOR_TEMPLATE(_name) \ diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c index 61194c292917..61f00c4fee49 100644 --- a/lib/raid/xor/arm64/xor-neon.c +++ b/lib/raid/xor/arm64/xor-neon.c @@ -4,10 +4,10 @@ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. */ -#include #include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #include "xor-neon.h" void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/arm64/xor_arch.h b/lib/raid/xor/arm64/xor_arch.h new file mode 100644 index 000000000000..5dbb40319501 --- /dev/null +++ b/lib/raid/xor/arm64/xor_arch.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Authors: Jackie Liu + * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 
+ */ +#include + +extern struct xor_block_template xor_block_neon; +extern struct xor_block_template xor_block_eor3; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + if (cpu_has_neon()) { + if (cpu_have_named_feature(SHA3)) + xor_register(&xor_block_eor3); + else + xor_register(&xor_block_neon); + } +} diff --git a/lib/raid/xor/loongarch/xor_arch.h b/lib/raid/xor/loongarch/xor_arch.h new file mode 100644 index 000000000000..fe5e8244fd0e --- /dev/null +++ b/lib/raid/xor/loongarch/xor_arch.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + */ +#include + +/* + * For grins, also test the generic routines. + * + * More importantly: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +extern struct xor_block_template xor_block_lsx; +extern struct xor_block_template xor_block_lasx; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_CPU_HAS_LSX + if (cpu_has_lsx) + xor_register(&xor_block_lsx); +#endif +#ifdef CONFIG_CPU_HAS_LASX + if (cpu_has_lasx) + xor_register(&xor_block_lasx); +#endif +} diff --git a/lib/raid/xor/loongarch/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c index 11fa3b47ba83..b387aa0213b4 100644 --- a/lib/raid/xor/loongarch/xor_simd_glue.c +++ b/lib/raid/xor/loongarch/xor_simd_glue.c @@ -6,9 +6,9 @@ */ #include -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #include "xor_simd.h" #define MAKE_XOR_GLUE_2(flavor) \ diff --git a/lib/raid/xor/powerpc/xor_arch.h b/lib/raid/xor/powerpc/xor_arch.h new file mode 100644 index 000000000000..3b00a4a2fd67 --- /dev/null +++ b/lib/raid/xor/powerpc/xor_arch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include + +extern struct xor_block_template xor_block_altivec; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + xor_register(&xor_block_altivec); +#endif +} diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c index c41e38340700..56e99ddfb64f 100644 --- a/lib/raid/xor/powerpc/xor_vmx_glue.c +++ b/lib/raid/xor/powerpc/xor_vmx_glue.c @@ -7,9 +7,9 @@ #include #include -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #include "xor_vmx.h" static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c index 11666a4b6b68..060e5f22ebcc 100644 --- a/lib/raid/xor/riscv/xor-glue.c +++ b/lib/raid/xor/riscv/xor-glue.c @@ -3,11 +3,11 @@ * Copyright (C) 2021 SiFive */ -#include #include #include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1, const unsigned long *__restrict p2) diff --git a/lib/raid/xor/riscv/xor_arch.h 
b/lib/raid/xor/riscv/xor_arch.h new file mode 100644 index 000000000000..9240857d760b --- /dev/null +++ b/lib/raid/xor/riscv/xor_arch.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ +#include + +extern struct xor_block_template xor_block_rvv; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_RISCV_ISA_V + if (has_vector()) + xor_register(&xor_block_rvv); +#endif +} diff --git a/lib/raid/xor/s390/xor.c b/lib/raid/xor/s390/xor.c index acbd268adfc8..c28cb56fec92 100644 --- a/lib/raid/xor/s390/xor.c +++ b/lib/raid/xor/s390/xor.c @@ -7,8 +7,8 @@ */ #include -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) diff --git a/lib/raid/xor/s390/xor_arch.h b/lib/raid/xor/s390/xor_arch.h new file mode 100644 index 000000000000..4a233ed2b97a --- /dev/null +++ b/lib/raid/xor/s390/xor_arch.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Optimited xor routines + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ +extern struct xor_block_template xor_block_xc; + +static __always_inline void __init arch_xor_init(void) +{ + xor_force(&xor_block_xc); +} diff --git a/lib/raid/xor/sparc/xor-sparc32.c b/lib/raid/xor/sparc/xor-sparc32.c index b65a75a6e59d..307c4a84f535 100644 --- a/lib/raid/xor/sparc/xor-sparc32.c +++ b/lib/raid/xor/sparc/xor-sparc32.c @@ -5,8 +5,8 @@ * * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) */ -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" static void sparc_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/sparc/xor-sparc64-glue.c b/lib/raid/xor/sparc/xor-sparc64-glue.c index 3c67c8c3a0e8..5f90c2460b54 100644 --- a/lib/raid/xor/sparc/xor-sparc64-glue.c +++ b/lib/raid/xor/sparc/xor-sparc64-glue.c @@ -8,8 +8,8 @@ * Copyright (C) 2006 David S. Miller */ -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2); diff --git a/lib/raid/xor/sparc/xor_arch.h b/lib/raid/xor/sparc/xor_arch.h new file mode 100644 index 000000000000..af288abe4e91 --- /dev/null +++ b/lib/raid/xor/sparc/xor_arch.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 2006 David S. Miller + */ +#if defined(__sparc__) && defined(__arch64__) +#include + +extern struct xor_block_template xor_block_VIS; +extern struct xor_block_template xor_block_niagara; + +static __always_inline void __init arch_xor_init(void) +{ + /* Force VIS for everything except Niagara. 
*/ + if (tlb_type == hypervisor && + (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) + xor_force(&xor_block_niagara); + else + xor_force(&xor_block_VIS); +} +#else /* sparc64 */ + +extern struct xor_block_template xor_block_SPARC; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_SPARC); +} +#endif /* !sparc64 */ diff --git a/lib/raid/xor/um/xor_arch.h b/lib/raid/xor/um/xor_arch.h new file mode 100644 index 000000000000..a33e57a26c5e --- /dev/null +++ b/lib/raid/xor/um/xor_arch.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <../x86/xor_arch.h> diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c index b49cb5199e70..d411efa1ff43 100644 --- a/lib/raid/xor/x86/xor-avx.c +++ b/lib/raid/xor/x86/xor-avx.c @@ -8,9 +8,9 @@ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines */ #include -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #define BLOCK4(i) \ BLOCK(32 * i, 0) \ diff --git a/lib/raid/xor/x86/xor-mmx.c b/lib/raid/xor/x86/xor-mmx.c index cf0fafea33b7..e48c58f92874 100644 --- a/lib/raid/xor/x86/xor-mmx.c +++ b/lib/raid/xor/x86/xor-mmx.c @@ -4,9 +4,9 @@ * * Copyright (C) 1998 Ingo Molnar. */ -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" #define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" diff --git a/lib/raid/xor/x86/xor-sse.c b/lib/raid/xor/x86/xor-sse.c index 0e727ced8b00..5993ed688c15 100644 --- a/lib/raid/xor/x86/xor-sse.c +++ b/lib/raid/xor/x86/xor-sse.c @@ -12,9 +12,9 @@ * x86-64 changes / gcc fixes from Andi Kleen. * Copyright 2002 Andi Kleen, SuSE Labs. */ -#include #include -#include +#include "xor_impl.h" +#include "xor_arch.h" #ifdef CONFIG_X86_32 /* reduce register pressure */ diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h new file mode 100644 index 000000000000..99fe85a213c6 --- /dev/null +++ b/lib/raid/xor/x86/xor_arch.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include + +extern struct xor_block_template xor_block_pII_mmx; +extern struct xor_block_template xor_block_p5_mmx; +extern struct xor_block_template xor_block_sse; +extern struct xor_block_template xor_block_sse_pf64; +extern struct xor_block_template xor_block_avx; + +/* + * When SSE is available, use it as it can write around L2. We may also be able + * to load into the L1 only depending on how the cpu deals with a load to a line + * that is being prefetched. + * + * When AVX2 is available, force using it as it is better by all measures. + * + * 32-bit without MMX can fall back to the generic routines. 
+ */ +static __always_inline void __init arch_xor_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_OSXSAVE)) { + xor_force(&xor_block_avx); + } else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) { + xor_register(&xor_block_sse); + xor_register(&xor_block_sse_pf64); + } else if (boot_cpu_has(X86_FEATURE_MMX)) { + xor_register(&xor_block_pII_mmx); + xor_register(&xor_block_p5_mmx); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); + } +} diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c index 8666c287f777..2856a8e50cb8 100644 --- a/lib/raid/xor/xor-32regs-prefetch.c +++ b/lib/raid/xor/xor-32regs-prefetch.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include +#include "xor_impl.h" static void xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c index 58d4fac43eb4..cc44d64032fa 100644 --- a/lib/raid/xor/xor-32regs.c +++ b/lib/raid/xor/xor-32regs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include +#include "xor_impl.h" static void xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c index 67061e35a0a6..1d53aec50d27 100644 --- a/lib/raid/xor/xor-8regs-prefetch.c +++ b/lib/raid/xor/xor-8regs-prefetch.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include +#include "xor_impl.h" static void xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c index 769f796ab2cf..72a44e898c55 100644 --- a/lib/raid/xor/xor-8regs.c +++ b/lib/raid/xor/xor-8regs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include +#include "xor_impl.h" static void xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1, diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c index 93608b5fece9..de1d2899490a 100644 --- a/lib/raid/xor/xor-core.c +++ b/lib/raid/xor/xor-core.c @@ -9,10 +9,9 @@ #include #include #include -#include #include #include -#include +#include "xor_impl.h" /* The xor routines to use. 
*/ static struct xor_block_template *active_template; @@ -141,16 +140,21 @@ static int __init calibrate_xor_blocks(void) return 0; } -static int __init xor_init(void) -{ -#ifdef arch_xor_init - arch_xor_init(); +#ifdef CONFIG_XOR_BLOCKS_ARCH +#include "xor_arch.h" /* $SRCARCH/xor_arch.h */ #else +static void __init arch_xor_init(void) +{ xor_register(&xor_block_8regs); xor_register(&xor_block_8regs_p); xor_register(&xor_block_32regs); xor_register(&xor_block_32regs_p); -#endif +} +#endif /* CONFIG_XOR_BLOCKS_ARCH */ + +static int __init xor_init(void) +{ + arch_xor_init(); /* * If this arch/cpu has a short-circuited selection, don't loop through diff --git a/lib/raid/xor/xor_impl.h b/lib/raid/xor/xor_impl.h new file mode 100644 index 000000000000..44b6c99e2093 --- /dev/null +++ b/lib/raid/xor/xor_impl.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XOR_IMPL_H +#define _XOR_IMPL_H + +#include + +struct xor_block_template { + struct xor_block_template *next; + const char *name; + int speed; + void (*do_2)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_3)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_4)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); + void (*do_5)(unsigned long, unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict, + const unsigned long * __restrict); +}; + +/* generic implementations */ +extern struct xor_block_template xor_block_8regs; +extern struct xor_block_template xor_block_32regs; +extern struct xor_block_template xor_block_8regs_p; +extern struct xor_block_template xor_block_32regs_p; + +void __init xor_register(struct xor_block_template *tmpl); +void __init xor_force(struct xor_block_template *tmpl); + +#endif /* _XOR_IMPL_H */ -- cgit v1.2.3 From e420f0a88b24b80302f57965ceb7387aa3f12488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:54 +0100 Subject: xor: add a better public API xor_blocks is very annoying to use, because it is limited to 4 + 1 sources / destinations, has an odd argument order and is completely undocumented. Lift the code that loops around it from btrfs and async_tx/async_xor into common code under the name xor_gen and properly document it. [hch@lst.de: make xor_blocks less annoying to use] Link: https://lkml.kernel.org/r/20260327061704.3707577-24-hch@lst.de Link: https://lkml.kernel.org/r/20260327061704.3707577-23-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. 
Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 2 ++ lib/raid/xor/xor-core.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 02bda8d99534..6d9a39fd85dd 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -7,4 +7,6 @@ extern void xor_blocks(unsigned int count, unsigned int bytes, void *dest, void **srcs); +void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes); + #endif /* _XOR_H */ diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c index de1d2899490a..2e46b6b83b0a 100644 --- a/lib/raid/xor/xor-core.c +++ b/lib/raid/xor/xor-core.c @@ -46,6 +46,40 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) } EXPORT_SYMBOL(xor_blocks); +/** + * xor_gen - generate RAID-style XOR information + * @dest: destination vector + * @srcs: source vectors + * @src_cnt: number of source vectors + * @bytes: length in bytes of each vector + * + * Performs bit-wise XOR operation into @dest for each of the @src_cnt vectors + * in @srcs for a length of @bytes bytes. @src_cnt must be non-zero, and the + * memory pointed to by @dest and each member of @srcs must be at least 64-byte + * aligned. @bytes must be non-zero and a multiple of 512. + * + * Note: for typical RAID uses, @dest either needs to be zeroed, or filled with + * the first disk, which then needs to be removed from @srcs. + */ +void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes) +{ + unsigned int src_off = 0; + + WARN_ON_ONCE(in_interrupt()); + WARN_ON_ONCE(bytes == 0); + WARN_ON_ONCE(bytes & 511); + + while (src_cnt > 0) { + unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS); + + xor_blocks(this_cnt, bytes, dest, srcs + src_off); + + src_cnt -= this_cnt; + src_off += this_cnt; + } +} +EXPORT_SYMBOL(xor_gen); + /* Set of all registered templates. */ static struct xor_block_template *__initdata template_list; static bool __initdata xor_forced = false; -- cgit v1.2.3 From 80dcf0a7832a5acde0f0701a4dc7b586fc8bcc88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 07:16:58 +0100 Subject: xor: pass the entire operation to the low-level ops Currently the high-level xor code chunks up all operations into small units of at most 1 + 4 vectors and passes them to four different methods. This means the FPU/vector context is entered and left a lot for wide stripes, and a lot of expensive indirect calls are performed. Switch to passing the entire xor_gen request to the low-level ops, and provide a macro to dispatch it to the existing helpers. This reduces the number of indirect calls and FPU/vector context switches by a factor approaching nr_stripes / 4, and also reduces source and binary code size.
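For illustration, a minimal caller-side sketch of the xor_gen() interface introduced in the previous patch (the raid5_gen_parity() helper and its arguments are hypothetical, not part of this series):

	#include <linux/string.h>
	#include <linux/raid/xor.h>

	/*
	 * Hypothetical helper: XOR @nr source buffers into @parity.  Per the
	 * xor_gen() kernel-doc, @parity and every entry of @srcs must be at
	 * least 64-byte aligned, @bytes must be a non-zero multiple of 512,
	 * and @parity must be pre-seeded -- here it is simply zeroed first.
	 */
	static void raid5_gen_parity(void *parity, void **srcs, unsigned int nr,
				     unsigned int bytes)
	{
		memset(parity, 0, bytes);
		xor_gen(parity, srcs, nr, bytes);
	}

Note that the caller no longer has to know about MAX_XOR_BLOCKS or chunk the sources itself; with this patch the chunking also moves below the indirect call.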
Link: https://lkml.kernel.org/r/20260327061704.3707577-27-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Tested-by: Eric Biggers Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Anton Ivanov Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chris Mason Cc: Christian Borntraeger Cc: Dan Williams Cc: David S. Miller Cc: David Sterba Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Johannes Berg Cc: Li Nan Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/raid/xor.h | 5 --- lib/raid/xor/alpha/xor.c | 19 +++++----- lib/raid/xor/arm/xor-neon-glue.c | 49 +++----------------------- lib/raid/xor/arm/xor-neon.c | 9 ++--- lib/raid/xor/arm/xor.c | 10 +++--- lib/raid/xor/arm/xor_arch.h | 3 ++ lib/raid/xor/arm64/xor-neon-glue.c | 44 +++-------------------- lib/raid/xor/arm64/xor-neon.c | 20 +++++++---- lib/raid/xor/arm64/xor-neon.h | 32 +++-------------- lib/raid/xor/loongarch/xor_simd_glue.c | 62 ++++++-------------------------- lib/raid/xor/powerpc/xor_vmx.c | 40 +++++++++++---------- lib/raid/xor/powerpc/xor_vmx.h | 16 ++------- lib/raid/xor/powerpc/xor_vmx_glue.c | 49 +++----------------------- lib/raid/xor/riscv/xor-glue.c | 43 ++++------------------- lib/raid/xor/s390/xor.c | 9 +++-- lib/raid/xor/sparc/xor-sparc32.c | 9 +++-- lib/raid/xor/sparc/xor-sparc64-glue.c | 19 +++++----- lib/raid/xor/x86/xor-avx.c | 29 ++++++--------- lib/raid/xor/x86/xor-mmx.c | 64 +++++++++++++--------------------- lib/raid/xor/x86/xor-sse.c | 63 ++++++++++++--------------------- lib/raid/xor/xor-32regs-prefetch.c | 10 +++--- lib/raid/xor/xor-32regs.c | 9 +++-- lib/raid/xor/xor-8regs-prefetch.c | 11 +++--- lib/raid/xor/xor-8regs.c | 9 +++-- lib/raid/xor/xor-core.c | 48 +++---------------------- lib/raid/xor/xor_impl.h | 48 +++++++++++++++++-------- 26 files changed, 224 insertions(+), 505 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 6d9a39fd85dd..870558c9d36e 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -2,11 +2,6 @@ #ifndef _XOR_H #define _XOR_H -#define MAX_XOR_BLOCKS 4 - -extern void xor_blocks(unsigned int count, unsigned int bytes, - void *dest, void **srcs); - void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes); #endif /* _XOR_H */ diff --git a/lib/raid/xor/alpha/xor.c b/lib/raid/xor/alpha/xor.c index 90694cc47395..a8f72f2dd3a5 100644 --- a/lib/raid/xor/alpha/xor.c +++ b/lib/raid/xor/alpha/xor.c @@ -832,18 +832,17 @@ xor_alpha_prefetch_5: \n\ .end xor_alpha_prefetch_5 \n\ "); +DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5); + struct xor_block_template xor_block_alpha = { - .name = "alpha", - .do_2 = xor_alpha_2, - .do_3 = xor_alpha_3, - .do_4 = xor_alpha_4, - .do_5 = xor_alpha_5, + .name = "alpha", + .xor_gen = xor_gen_alpha, }; +DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3, + xor_alpha_prefetch_4, xor_alpha_prefetch_5); + struct xor_block_template xor_block_alpha_prefetch = { - .name = "alpha prefetch", - .do_2 = xor_alpha_prefetch_2, - .do_3 = xor_alpha_prefetch_3, - .do_4 = xor_alpha_prefetch_4, - .do_5 = 
xor_alpha_prefetch_5, + .name = "alpha prefetch", + .xor_gen = xor_gen_alpha_prefetch, }; diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c index 7afd6294464b..cea39e019904 100644 --- a/lib/raid/xor/arm/xor-neon-glue.c +++ b/lib/raid/xor/arm/xor-neon-glue.c @@ -5,54 +5,15 @@ #include "xor_impl.h" #include "xor_arch.h" -extern struct xor_block_template const xor_block_neon_inner; - -static void -xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - kernel_neon_begin(); - xor_block_neon_inner.do_2(bytes, p1, p2); - kernel_neon_end(); -} - -static void -xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - kernel_neon_begin(); - xor_block_neon_inner.do_3(bytes, p1, p2, p3); - kernel_neon_end(); -} - -static void -xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - kernel_neon_begin(); - xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4); - kernel_neon_end(); -} - -static void -xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) +static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) { kernel_neon_begin(); - xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5); + xor_gen_neon_inner(dest, srcs, src_cnt, bytes); kernel_neon_end(); } struct xor_block_template xor_block_neon = { - .name = "neon", - .do_2 = xor_neon_2, - .do_3 = xor_neon_3, - .do_4 = xor_neon_4, - .do_5 = xor_neon_5 + .name = "neon", + .xor_gen = xor_gen_neon, }; diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c index 806a42c5952c..23147e3a7904 100644 --- a/lib/raid/xor/arm/xor-neon.c +++ b/lib/raid/xor/arm/xor-neon.c @@ -4,6 +4,7 @@ */ #include "xor_impl.h" +#include "xor_arch.h" #ifndef __ARM_NEON__ #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' @@ -22,10 +23,4 @@ #define NO_TEMPLATE #include "../xor-8regs.c" -struct xor_block_template const xor_block_neon_inner = { - .name = "__inner_neon__", - .do_2 = xor_8regs_2, - .do_3 = xor_8regs_3, - .do_4 = xor_8regs_4, - .do_5 = xor_8regs_5, -}; +__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); diff --git a/lib/raid/xor/arm/xor.c b/lib/raid/xor/arm/xor.c index 5bd5f048bbe9..45139b6c55ea 100644 --- a/lib/raid/xor/arm/xor.c +++ b/lib/raid/xor/arm/xor.c @@ -127,10 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines); } +DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4, + xor_arm4regs_5); + struct xor_block_template xor_block_arm4regs = { - .name = "arm4regs", - .do_2 = xor_arm4regs_2, - .do_3 = xor_arm4regs_3, - .do_4 = xor_arm4regs_4, - .do_5 = xor_arm4regs_5, + .name = "arm4regs", + .xor_gen = xor_gen_arm4regs, }; diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h index 5a7eedb48fbb..775ff835df65 100644 --- a/lib/raid/xor/arm/xor_arch.h +++ b/lib/raid/xor/arm/xor_arch.h @@ -7,6 +7,9 @@ extern struct xor_block_template xor_block_arm4regs; extern struct xor_block_template xor_block_neon; +void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); + static __always_inline void 
__init arch_xor_init(void) { xor_register(&xor_block_arm4regs); diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c index 3db0a318cf5b..f0284f86feb4 100644 --- a/lib/raid/xor/arm64/xor-neon-glue.c +++ b/lib/raid/xor/arm64/xor-neon-glue.c @@ -10,50 +10,16 @@ #include "xor-neon.h" #define XOR_TEMPLATE(_name) \ -static void \ -xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2) \ +static void xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ { \ scoped_ksimd() \ - __xor_##_name##_2(bytes, p1, p2); \ -} \ - \ -static void \ -xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3) \ -{ \ - scoped_ksimd() \ - __xor_##_name##_3(bytes, p1, p2, p3); \ -} \ - \ -static void \ -xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4) \ -{ \ - scoped_ksimd() \ - __xor_##_name##_4(bytes, p1, p2, p3, p4); \ -} \ - \ -static void \ -xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4, \ - const unsigned long * __restrict p5) \ -{ \ - scoped_ksimd() \ - __xor_##_name##_5(bytes, p1, p2, p3, p4, p5); \ + xor_gen_##_name##_inner(dest, srcs, src_cnt, bytes); \ } \ \ struct xor_block_template xor_block_##_name = { \ - .name = __stringify(_name), \ - .do_2 = xor_##_name##_2, \ - .do_3 = xor_##_name##_3, \ - .do_4 = xor_##_name##_4, \ - .do_5 = xor_##_name##_5 \ + .name = __stringify(_name), \ + .xor_gen = xor_gen_##_name, \ }; XOR_TEMPLATE(neon); diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c index 61f00c4fee49..97ef3cb92496 100644 --- a/lib/raid/xor/arm64/xor-neon.c +++ b/lib/raid/xor/arm64/xor-neon.c @@ -10,7 +10,7 @@ #include "xor_arch.h" #include "xor-neon.h" -void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) { uint64_t *dp1 = (uint64_t *)p1; @@ -37,7 +37,7 @@ void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3) { @@ -73,7 +73,7 @@ void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3, const unsigned long * __restrict p4) @@ -118,7 +118,7 @@ void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3, const unsigned long * __restrict p4, @@ -172,6 +172,9 @@ void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } +__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, 
__xor_neon_3, __xor_neon_4, + __xor_neon_5); + static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) { uint64x2_t res; @@ -182,7 +185,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) return res; } -void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3) { @@ -216,7 +219,7 @@ void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3, const unsigned long * __restrict p4) @@ -259,7 +262,7 @@ void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, +static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2, const unsigned long * __restrict p3, const unsigned long * __restrict p4, @@ -304,3 +307,6 @@ void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, dp5 += 8; } while (--lines > 0); } + +__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4, + __xor_eor3_5); diff --git a/lib/raid/xor/arm64/xor-neon.h b/lib/raid/xor/arm64/xor-neon.h index cec0ac846fea..514699ba8f5f 100644 --- a/lib/raid/xor/arm64/xor-neon.h +++ b/lib/raid/xor/arm64/xor-neon.h @@ -1,30 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3); -void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5); - -#define __xor_eor3_2 __xor_neon_2 -void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3); -void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5); +void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); +void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); diff --git a/lib/raid/xor/loongarch/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c index b387aa0213b4..7f324d924f87 100644 --- a/lib/raid/xor/loongarch/xor_simd_glue.c +++ b/lib/raid/xor/loongarch/xor_simd_glue.c @@ -11,63 +11,23 @@ #include "xor_arch.h" #include "xor_simd.h" -#define MAKE_XOR_GLUE_2(flavor) \ -static void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict 
p1,\ - const unsigned long * __restrict p2) \ +#define MAKE_XOR_GLUES(flavor) \ +DO_XOR_BLOCKS(flavor##_inner, __xor_##flavor##_2, __xor_##flavor##_3, \ + __xor_##flavor##_4, __xor_##flavor##_5); \ + \ +static void xor_gen_##flavor(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ { \ kernel_fpu_begin(); \ - __xor_##flavor##_2(bytes, p1, p2); \ + xor_gen_##flavor##_inner(dest, srcs, src_cnt, bytes); \ kernel_fpu_end(); \ } \ - -#define MAKE_XOR_GLUE_3(flavor) \ -static void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,\ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_3(bytes, p1, p2, p3); \ - kernel_fpu_end(); \ -} \ - -#define MAKE_XOR_GLUE_4(flavor) \ -static void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,\ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_4(bytes, p1, p2, p3, p4); \ - kernel_fpu_end(); \ -} \ - -#define MAKE_XOR_GLUE_5(flavor) \ -static void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,\ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4, \ - const unsigned long * __restrict p5) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_5(bytes, p1, p2, p3, p4, p5); \ - kernel_fpu_end(); \ -} \ - -#define MAKE_XOR_GLUES(flavor) \ - MAKE_XOR_GLUE_2(flavor); \ - MAKE_XOR_GLUE_3(flavor); \ - MAKE_XOR_GLUE_4(flavor); \ - MAKE_XOR_GLUE_5(flavor); \ - \ -struct xor_block_template xor_block_##flavor = { \ - .name = __stringify(flavor), \ - .do_2 = xor_##flavor##_2, \ - .do_3 = xor_##flavor##_3, \ - .do_4 = xor_##flavor##_4, \ - .do_5 = xor_##flavor##_5, \ + \ +struct xor_block_template xor_block_##flavor = { \ + .name = __stringify(flavor), \ + .xor_gen = xor_gen_##flavor \ } - #ifdef CONFIG_CPU_HAS_LSX MAKE_XOR_GLUES(lsx); #endif /* CONFIG_CPU_HAS_LSX */ diff --git a/lib/raid/xor/powerpc/xor_vmx.c b/lib/raid/xor/powerpc/xor_vmx.c index aab49d056d18..09bed98c1bc7 100644 --- a/lib/raid/xor/powerpc/xor_vmx.c +++ b/lib/raid/xor/powerpc/xor_vmx.c @@ -10,6 +10,7 @@ * Sparse (as at v0.5.0) gets very, very confused by this file. * Make it a bit simpler for it. 
*/ +#include "xor_impl.h" #if !defined(__CHECKER__) #include #else @@ -49,9 +50,9 @@ typedef vector signed char unative_t; V1##_3 = vec_xor(V1##_3, V2##_3); \ } while (0) -void __xor_altivec_2(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in) +static void __xor_altivec_2(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in) { DEFINE(v1); DEFINE(v2); @@ -68,10 +69,10 @@ void __xor_altivec_2(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_3(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in) +static void __xor_altivec_3(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in) { DEFINE(v1); DEFINE(v2); @@ -92,11 +93,11 @@ void __xor_altivec_3(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_4(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in, - const unsigned long * __restrict v4_in) +static void __xor_altivec_4(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in) { DEFINE(v1); DEFINE(v2); @@ -121,12 +122,12 @@ void __xor_altivec_4(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_5(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in, - const unsigned long * __restrict v4_in, - const unsigned long * __restrict v5_in) +static void __xor_altivec_5(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in, + const unsigned long * __restrict v5_in) { DEFINE(v1); DEFINE(v2); @@ -154,3 +155,6 @@ void __xor_altivec_5(unsigned long bytes, v5 += 4; } while (--lines > 0); } + +__DO_XOR_BLOCKS(altivec_inner, __xor_altivec_2, __xor_altivec_3, + __xor_altivec_4, __xor_altivec_5); diff --git a/lib/raid/xor/powerpc/xor_vmx.h b/lib/raid/xor/powerpc/xor_vmx.h index 573c41d90dac..1d26c1133a86 100644 --- a/lib/raid/xor/powerpc/xor_vmx.h +++ b/lib/raid/xor/powerpc/xor_vmx.h @@ -6,17 +6,5 @@ * outside of the enable/disable altivec block. 
*/ -void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3); -void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5); +void xor_gen_altivec_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c index 56e99ddfb64f..dbfbb5cadc36 100644 --- a/lib/raid/xor/powerpc/xor_vmx_glue.c +++ b/lib/raid/xor/powerpc/xor_vmx_glue.c @@ -12,56 +12,17 @@ #include "xor_arch.h" #include "xor_vmx.h" -static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) +static void xor_gen_altivec(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) { preempt_disable(); enable_kernel_altivec(); - __xor_altivec_2(bytes, p1, p2); - disable_kernel_altivec(); - preempt_enable(); -} - -static void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_3(bytes, p1, p2, p3); - disable_kernel_altivec(); - preempt_enable(); -} - -static void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_4(bytes, p1, p2, p3, p4); - disable_kernel_altivec(); - preempt_enable(); -} - -static void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_5(bytes, p1, p2, p3, p4, p5); + xor_gen_altivec_inner(dest, srcs, src_cnt, bytes); disable_kernel_altivec(); preempt_enable(); } struct xor_block_template xor_block_altivec = { - .name = "altivec", - .do_2 = xor_altivec_2, - .do_3 = xor_altivec_3, - .do_4 = xor_altivec_4, - .do_5 = xor_altivec_5, + .name = "altivec", + .xor_gen = xor_gen_altivec, }; diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c index 060e5f22ebcc..2e4c1b05d998 100644 --- a/lib/raid/xor/riscv/xor-glue.c +++ b/lib/raid/xor/riscv/xor-glue.c @@ -9,48 +9,17 @@ #include "xor_impl.h" #include "xor_arch.h" -static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2) -{ - kernel_vector_begin(); - xor_regs_2_(bytes, p1, p2); - kernel_vector_end(); -} - -static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3) -{ - kernel_vector_begin(); - xor_regs_3_(bytes, p1, p2, p3); - kernel_vector_end(); -} - -static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3, 
- const unsigned long *__restrict p4) -{ - kernel_vector_begin(); - xor_regs_4_(bytes, p1, p2, p3, p4); - kernel_vector_end(); -} +DO_XOR_BLOCKS(vector_inner, xor_regs_2_, xor_regs_3_, xor_regs_4_, xor_regs_5_); -static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3, - const unsigned long *__restrict p4, - const unsigned long *__restrict p5) +static void xor_gen_vector(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) { kernel_vector_begin(); - xor_regs_5_(bytes, p1, p2, p3, p4, p5); + xor_gen_vector_inner(dest, srcs, src_cnt, bytes); kernel_vector_end(); } struct xor_block_template xor_block_rvv = { - .name = "rvv", - .do_2 = xor_vector_2, - .do_3 = xor_vector_3, - .do_4 = xor_vector_4, - .do_5 = xor_vector_5 + .name = "rvv", + .xor_gen = xor_gen_vector, }; diff --git a/lib/raid/xor/s390/xor.c b/lib/raid/xor/s390/xor.c index c28cb56fec92..0c478678a129 100644 --- a/lib/raid/xor/s390/xor.c +++ b/lib/raid/xor/s390/xor.c @@ -125,10 +125,9 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, : : "0", "cc", "memory"); } +DO_XOR_BLOCKS(xc, xor_xc_2, xor_xc_3, xor_xc_4, xor_xc_5); + struct xor_block_template xor_block_xc = { - .name = "xc", - .do_2 = xor_xc_2, - .do_3 = xor_xc_3, - .do_4 = xor_xc_4, - .do_5 = xor_xc_5, + .name = "xc", + .xor_gen = xor_gen_xc, }; diff --git a/lib/raid/xor/sparc/xor-sparc32.c b/lib/raid/xor/sparc/xor-sparc32.c index 307c4a84f535..fb37631e90e6 100644 --- a/lib/raid/xor/sparc/xor-sparc32.c +++ b/lib/raid/xor/sparc/xor-sparc32.c @@ -244,10 +244,9 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } +DO_XOR_BLOCKS(sparc32, sparc_2, sparc_3, sparc_4, sparc_5); + struct xor_block_template xor_block_SPARC = { - .name = "SPARC", - .do_2 = sparc_2, - .do_3 = sparc_3, - .do_4 = sparc_4, - .do_5 = sparc_5, + .name = "SPARC", + .xor_gen = xor_gen_sparc32, }; diff --git a/lib/raid/xor/sparc/xor-sparc64-glue.c b/lib/raid/xor/sparc/xor-sparc64-glue.c index 5f90c2460b54..a8a686e0d258 100644 --- a/lib/raid/xor/sparc/xor-sparc64-glue.c +++ b/lib/raid/xor/sparc/xor-sparc64-glue.c @@ -28,12 +28,11 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1, /* XXX Ugh, write cheetah versions... 
-DaveM */ +DO_XOR_BLOCKS(vis, xor_vis_2, xor_vis_3, xor_vis_4, xor_vis_5); + struct xor_block_template xor_block_VIS = { - .name = "VIS", - .do_2 = xor_vis_2, - .do_3 = xor_vis_3, - .do_4 = xor_vis_4, - .do_5 = xor_vis_5, + .name = "VIS", + .xor_gen = xor_gen_vis, }; void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1, @@ -51,10 +50,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p4, const unsigned long * __restrict p5); +DO_XOR_BLOCKS(niagara, xor_niagara_2, xor_niagara_3, xor_niagara_4, + xor_niagara_5); + struct xor_block_template xor_block_niagara = { - .name = "Niagara", - .do_2 = xor_niagara_2, - .do_3 = xor_niagara_3, - .do_4 = xor_niagara_4, - .do_5 = xor_niagara_5, + .name = "Niagara", + .xor_gen = xor_gen_niagara, }; diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c index d411efa1ff43..f7777d7aa269 100644 --- a/lib/raid/xor/x86/xor-avx.c +++ b/lib/raid/xor/x86/xor-avx.c @@ -29,8 +29,6 @@ static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -47,8 +45,6 @@ do { \ p0 = (unsigned long *)((uintptr_t)p0 + 512); p1 = (unsigned long *)((uintptr_t)p1 + 512); } - - kernel_fpu_end(); } static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, @@ -57,8 +53,6 @@ static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -78,8 +72,6 @@ do { \ p1 = (unsigned long *)((uintptr_t)p1 + 512); p2 = (unsigned long *)((uintptr_t)p2 + 512); } - - kernel_fpu_end(); } static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, @@ -89,8 +81,6 @@ static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -113,8 +103,6 @@ do { \ p2 = (unsigned long *)((uintptr_t)p2 + 512); p3 = (unsigned long *)((uintptr_t)p3 + 512); } - - kernel_fpu_end(); } static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, @@ -125,8 +113,6 @@ static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -152,14 +138,19 @@ do { \ p3 = (unsigned long *)((uintptr_t)p3 + 512); p4 = (unsigned long *)((uintptr_t)p4 + 512); } +} + +DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5); +static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_avx_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } struct xor_block_template xor_block_avx = { - .name = "avx", - .do_2 = xor_avx_2, - .do_3 = xor_avx_3, - .do_4 = xor_avx_4, - .do_5 = xor_avx_5, + .name = "avx", + .xor_gen = xor_gen_avx, }; diff --git a/lib/raid/xor/x86/xor-mmx.c b/lib/raid/xor/x86/xor-mmx.c index e48c58f92874..63a8b0444fce 100644 --- a/lib/raid/xor/x86/xor-mmx.c +++ b/lib/raid/xor/x86/xor-mmx.c @@ -21,8 +21,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -55,8 +53,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2) : : "memory"); - - kernel_fpu_end(); } 
static void @@ -66,8 +62,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -105,8 +99,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3) : : "memory"); - - kernel_fpu_end(); } static void @@ -117,8 +109,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -161,8 +151,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) : : "memory"); - - kernel_fpu_end(); } @@ -175,8 +163,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a register that is shared with any other variable. That's @@ -237,8 +223,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1, Clobber them just to be sure nobody does something stupid like assuming they have some legal value. */ asm("" : "=r" (p4), "=r" (p5)); - - kernel_fpu_end(); } #undef LD @@ -255,8 +239,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32 ;\n" " 1: ;\n" @@ -293,8 +275,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2) : : "memory"); - - kernel_fpu_end(); } static void @@ -304,8 +284,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32,0x90 ;\n" " 1: ;\n" @@ -351,8 +329,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3) : : "memory" ); - - kernel_fpu_end(); } static void @@ -363,8 +339,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32,0x90 ;\n" " 1: ;\n" @@ -419,8 +393,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) : : "memory"); - - kernel_fpu_end(); } static void @@ -432,8 +404,6 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a register that is shared with any other variable. That's @@ -510,22 +480,36 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1, Clobber them just to be sure nobody does something stupid like assuming they have some legal value. 
*/ asm("" : "=r" (p4), "=r" (p5)); +} + +DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4, + xor_pII_mmx_5); +static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } struct xor_block_template xor_block_pII_mmx = { - .name = "pII_mmx", - .do_2 = xor_pII_mmx_2, - .do_3 = xor_pII_mmx_3, - .do_4 = xor_pII_mmx_4, - .do_5 = xor_pII_mmx_5, + .name = "pII_mmx", + .xor_gen = xor_gen_pII_mmx, }; +DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4, + xor_p5_mmx_5); + +static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes); + kernel_fpu_end(); +} + struct xor_block_template xor_block_p5_mmx = { - .name = "p5_mmx", - .do_2 = xor_p5_mmx_2, - .do_3 = xor_p5_mmx_3, - .do_4 = xor_p5_mmx_4, - .do_5 = xor_p5_mmx_5, + .name = "p5_mmx", + .xor_gen = xor_gen_p5_mmx, }; diff --git a/lib/raid/xor/x86/xor-sse.c b/lib/raid/xor/x86/xor-sse.c index 5993ed688c15..c6626ecae6ba 100644 --- a/lib/raid/xor/x86/xor-sse.c +++ b/lib/raid/xor/x86/xor-sse.c @@ -51,8 +51,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -93,8 +91,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -103,8 +99,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -128,8 +122,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -139,8 +131,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -188,8 +178,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -199,8 +187,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -226,8 +212,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -238,8 +222,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -294,8 +276,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -306,8 +286,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -335,8 +313,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, [p2] "+r" (p2), [p3] "+r" 
(p3), [p4] "+r" (p4) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -348,8 +324,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -411,8 +385,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -424,8 +396,6 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -455,22 +425,35 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); +} + +DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5); +static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_sse_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } struct xor_block_template xor_block_sse = { - .name = "sse", - .do_2 = xor_sse_2, - .do_3 = xor_sse_3, - .do_4 = xor_sse_4, - .do_5 = xor_sse_5, + .name = "sse", + .xor_gen = xor_gen_sse, }; +DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64, + xor_sse_5_pf64); + +static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes); + kernel_fpu_end(); +} + struct xor_block_template xor_block_sse_pf64 = { - .name = "prefetch64-sse", - .do_2 = xor_sse_2_pf64, - .do_3 = xor_sse_3_pf64, - .do_4 = xor_sse_4_pf64, - .do_5 = xor_sse_5_pf64, + .name = "prefetch64-sse", + .xor_gen = xor_gen_sse_pf64, }; diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c index 2856a8e50cb8..ade2a7d8cbe2 100644 --- a/lib/raid/xor/xor-32regs-prefetch.c +++ b/lib/raid/xor/xor-32regs-prefetch.c @@ -258,10 +258,10 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1, goto once_more; } +DO_XOR_BLOCKS(32regs_p, xor_32regs_p_2, xor_32regs_p_3, xor_32regs_p_4, + xor_32regs_p_5); + struct xor_block_template xor_block_32regs_p = { - .name = "32regs_prefetch", - .do_2 = xor_32regs_p_2, - .do_3 = xor_32regs_p_3, - .do_4 = xor_32regs_p_4, - .do_5 = xor_32regs_p_5, + .name = "32regs_prefetch", + .xor_gen = xor_gen_32regs_p, }; diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c index cc44d64032fa..acb4a10d1e95 100644 --- a/lib/raid/xor/xor-32regs.c +++ b/lib/raid/xor/xor-32regs.c @@ -209,10 +209,9 @@ xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } +DO_XOR_BLOCKS(32regs, xor_32regs_2, xor_32regs_3, xor_32regs_4, xor_32regs_5); + struct xor_block_template xor_block_32regs = { - .name = "32regs", - .do_2 = xor_32regs_2, - .do_3 = xor_32regs_3, - .do_4 = xor_32regs_4, - .do_5 = xor_32regs_5, + .name = "32regs", + .xor_gen = xor_gen_32regs, }; diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c index 1d53aec50d27..451527a951b1 100644 --- a/lib/raid/xor/xor-8regs-prefetch.c +++ b/lib/raid/xor/xor-8regs-prefetch.c @@ -136,10 +136,11 @@ xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1, goto once_more; } + +DO_XOR_BLOCKS(8regs_p, xor_8regs_p_2, xor_8regs_p_3, xor_8regs_p_4, + xor_8regs_p_5); + struct 
xor_block_template xor_block_8regs_p = { - .name = "8regs_prefetch", - .do_2 = xor_8regs_p_2, - .do_3 = xor_8regs_p_3, - .do_4 = xor_8regs_p_4, - .do_5 = xor_8regs_p_5, + .name = "8regs_prefetch", + .xor_gen = xor_gen_8regs_p, }; diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c index 72a44e898c55..1edaed8acffe 100644 --- a/lib/raid/xor/xor-8regs.c +++ b/lib/raid/xor/xor-8regs.c @@ -94,11 +94,10 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1, } #ifndef NO_TEMPLATE +DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); + struct xor_block_template xor_block_8regs = { - .name = "8regs", - .do_2 = xor_8regs_2, - .do_3 = xor_8regs_3, - .do_4 = xor_8regs_4, - .do_5 = xor_8regs_5, + .name = "8regs", + .xor_gen = xor_gen_8regs, }; #endif /* NO_TEMPLATE */ diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c index 2e46b6b83b0a..9e043d8c3a7a 100644 --- a/lib/raid/xor/xor-core.c +++ b/lib/raid/xor/xor-core.c @@ -13,39 +13,9 @@ #include #include "xor_impl.h" -/* The xor routines to use. */ +/* The xor routine to use. */ static struct xor_block_template *active_template; -void -xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) -{ - unsigned long *p1, *p2, *p3, *p4; - - WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count()); - - p1 = (unsigned long *) srcs[0]; - if (src_count == 1) { - active_template->do_2(bytes, dest, p1); - return; - } - - p2 = (unsigned long *) srcs[1]; - if (src_count == 2) { - active_template->do_3(bytes, dest, p1, p2); - return; - } - - p3 = (unsigned long *) srcs[2]; - if (src_count == 3) { - active_template->do_4(bytes, dest, p1, p2, p3); - return; - } - - p4 = (unsigned long *) srcs[3]; - active_template->do_5(bytes, dest, p1, p2, p3, p4); -} -EXPORT_SYMBOL(xor_blocks); - /** * xor_gen - generate RAID-style XOR information * @dest: destination vector @@ -63,20 +33,11 @@ EXPORT_SYMBOL(xor_blocks); */ void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes) { - unsigned int src_off = 0; - - WARN_ON_ONCE(in_interrupt()); + WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count()); WARN_ON_ONCE(bytes == 0); WARN_ON_ONCE(bytes & 511); - while (src_cnt > 0) { - unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS); - - xor_blocks(this_cnt, bytes, dest, srcs + src_off); - - src_cnt -= this_cnt; - src_off += this_cnt; - } + active_template->xor_gen(dest, srcs, src_cnt, bytes); } EXPORT_SYMBOL(xor_gen); @@ -120,6 +81,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) int speed; unsigned long reps; ktime_t min, start, t0; + void *srcs[1] = { b2 }; preempt_disable(); @@ -130,7 +92,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) cpu_relax(); do { mb(); /* prevent loop optimization */ - tmpl->do_2(BENCH_SIZE, b1, b2); + tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE); mb(); } while (reps++ < REPS || (t0 = ktime_get()) == start); min = ktime_sub(t0, start); diff --git a/lib/raid/xor/xor_impl.h b/lib/raid/xor/xor_impl.h index 44b6c99e2093..09ae2916f71e 100644 --- a/lib/raid/xor/xor_impl.h +++ b/lib/raid/xor/xor_impl.h @@ -3,27 +3,47 @@ #define _XOR_IMPL_H #include +#include struct xor_block_template { struct xor_block_template *next; const char *name; int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * 
__restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); + void (*xor_gen)(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); }; +#define __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \ +void \ +xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ +{ \ + unsigned int src_off = 0; \ + \ + while (src_cnt > 0) { \ + unsigned int this_cnt = min(src_cnt, 4); \ + \ + if (this_cnt == 1) \ + _handle1(bytes, dest, srcs[src_off]); \ + else if (this_cnt == 2) \ + _handle2(bytes, dest, srcs[src_off], \ + srcs[src_off + 1]); \ + else if (this_cnt == 3) \ + _handle3(bytes, dest, srcs[src_off], \ + srcs[src_off + 1], srcs[src_off + 2]); \ + else \ + _handle4(bytes, dest, srcs[src_off], \ + srcs[src_off + 1], srcs[src_off + 2], \ + srcs[src_off + 3]); \ + \ + src_cnt -= this_cnt; \ + src_off += this_cnt; \ + } \ +} + +#define DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \ + static __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) + /* generic implementations */ extern struct xor_block_template xor_block_8regs; extern struct xor_block_template xor_block_32regs; -- cgit v1.2.3
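Taken together, the series leaves a compact recipe for hooking an architecture into the xor core. A minimal sketch, assuming a hypothetical port "foo" that provides classic-style helpers xor_foo_2()..xor_foo_5(); nothing below exists in the tree, it only restates the pattern used by the conversions above:

	/* lib/raid/xor/foo/xor_arch.h -- pulled into xor-core.c when
	 * CONFIG_XOR_BLOCKS_ARCH selects this $SRCARCH directory. */
	extern struct xor_block_template xor_block_foo;

	static __always_inline void __init arch_xor_init(void)
	{
		/* Registered templates are benchmarked against the generic
		 * 8regs/32regs ones; xor_force() would skip calibration. */
		xor_register(&xor_block_foo);
	}

	/* lib/raid/xor/foo/xor.c */
	#include "xor_impl.h"
	#include "xor_arch.h"

	/* xor_foo_2()..xor_foo_5() are defined above with the usual
	 * (bytes, dest, srcs...) signature.  DO_XOR_BLOCKS() emits a static
	 * xor_gen_foo() that walks srcs[] up to four at a time and
	 * dispatches to the matching N-source helper. */
	DO_XOR_BLOCKS(foo, xor_foo_2, xor_foo_3, xor_foo_4, xor_foo_5);

	struct xor_block_template xor_block_foo = {
		.name		= "foo",
		.xor_gen	= xor_gen_foo,
	};

An implementation that needs a SIMD context would instead generate xor_gen_foo_inner() and wrap it in a small xor_gen_foo() that brackets the call with the architecture's kernel_fpu_begin()/kernel_fpu_end() equivalents, as the x86, arm64, powerpc and riscv conversions above do.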