Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/Makefile               |   7
-rw-r--r--  arch/x86/lib/copy_user_64.S         |  18
-rw-r--r--  arch/x86/lib/crc-t10dif-glue.c      |  51
-rw-r--r--  arch/x86/lib/crc32-glue.c           | 124
-rw-r--r--  arch/x86/lib/crc32-pclmul.S         | 217
-rw-r--r--  arch/x86/lib/crc32c-3way.S          | 360
-rw-r--r--  arch/x86/lib/crct10dif-pcl-asm_64.S | 332
-rw-r--r--  arch/x86/lib/getuser.S              |   5
8 files changed, 1111 insertions(+), 3 deletions(-)
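Note: the copy_user_64.S hunk below copies one possibly unaligned word first, then advances all pointers to the next 8-byte destination boundary before entering the word-copy loop. A minimal user-space C sketch of that prologue follows (illustrative only; the helper name and the use of memcpy are not kernel code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Copy the first 8 bytes as-is, then advance by 1..8 bytes so that dst
 * lands on an 8-byte boundary (8 when it was already aligned), matching
 * the leaq/andq/subq sequence in the hunk.  The aligned loop that
 * follows may re-copy a few of these bytes, which is harmless for a
 * plain memory copy.
 */
static void align_dst_after_first_word(unsigned char **dst,
                                       const unsigned char **src,
                                       size_t *count)
{
        uintptr_t d = (uintptr_t)*dst;
        size_t adv = ((d + 8) & ~(uintptr_t)7) - d;

        memcpy(*dst, *src, 8);          /* the possibly unaligned first word */
        *dst += adv;                    /* adv is between 1 and 8 */
        *src += adv;
        *count -= adv;
}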
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 98583a9dbab3..8a59c61624c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -38,6 +38,13 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o +obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o +crc32-x86-y := crc32-glue.o crc32-pclmul.o +crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o + +obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o +crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fc9fb5d06174..b8f74d80f35c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -74,6 +74,24 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 0b, 1b) .Llarge_movsq: + /* Do the first possibly unaligned word */ +0: movq (%rsi),%rax +1: movq %rax,(%rdi) + + _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail) + + /* What would be the offset to the aligned destination? */ + leaq 8(%rdi),%rax + andq $-8,%rax + subq %rdi,%rax + + /* .. and update pointers and count to match */ + addq %rax,%rdi + addq %rax,%rsi + subq %rax,%rcx + + /* make %rcx contain the number of words, %rax the remainder */ movq %rcx,%rax shrq $3,%rcx andl $7,%eax diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c new file mode 100644 index 000000000000..13f07ddc9122 --- /dev/null +++ b/arch/x86/lib/crc-t10dif-glue.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC-T10DIF using PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/crc-t10dif.h> +#include <linux/module.h> + +static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); + +u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + if (len >= 16 && + static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) { + kernel_fpu_begin(); + crc = crc_t10dif_pcl(crc, p, len); + kernel_fpu_end(); + return crc; + } + return crc_t10dif_generic(crc, p, len); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) + static_branch_enable(&have_pclmulqdq); + return 0; +} +arch_initcall(crc_t10dif_x86_init); + +static void __exit crc_t10dif_x86_exit(void) +{ +} +module_exit(crc_t10dif_x86_exit); + +bool crc_t10dif_is_optimized(void) +{ + return static_key_enabled(&have_pclmulqdq); +} +EXPORT_SYMBOL(crc_t10dif_is_optimized); + +MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c new file mode 100644 index 000000000000..2dd18a886ded --- /dev/null +++ b/arch/x86/lib/crc32-glue.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86-optimized CRC32 functions + * + * Copyright (C) 2008 Intel Corporation + * Copyright 2012 Xyratex Technology Limited + * Copyright 2024 Google LLC + */ + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/crc32.h> +#include <linux/linkage.h> +#include <linux/module.h> + +/* minimum size of buffer for crc32_pclmul_le_16 */ +#define CRC32_PCLMUL_MIN_LEN 64 + +static DEFINE_STATIC_KEY_FALSE(have_crc32); +static 
DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= CRC32_PCLMUL_MIN_LEN + 15 && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32_le_base(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(crc, p, n); + kernel_fpu_end(); + p += n; + len -= n; + } + if (len) + crc = crc32_le_base(crc, p, len); + return crc; +} +EXPORT_SYMBOL(crc32_le_arch); + +#ifdef CONFIG_X86_64 +#define CRC32_INST "crc32q %1, %q0" +#else +#define CRC32_INST "crc32l %1, %0" +#endif + +/* + * Use carryless multiply version of crc32c when buffer size is >= 512 to + * account for FPU state save/restore overhead. + */ +#define CRC32C_PCLMUL_BREAKEVEN 512 + +asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len) +{ + size_t num_longs; + + if (!static_branch_likely(&have_crc32)) + return crc32c_le_base(crc, p, len); + + if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + kernel_fpu_begin(); + crc = crc32c_x86_3way(crc, p, len); + kernel_fpu_end(); + return crc; + } + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) + asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p)); + + for (len %= sizeof(unsigned long); len; len--, p++) + asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p)); + + return crc; +} +EXPORT_SYMBOL(crc32c_le_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) + static_branch_enable(&have_pclmulqdq); + return 0; +} +arch_initcall(crc32_x86_init); + +static void __exit crc32_x86_exit(void) +{ +} +module_exit(crc32_x86_exit); + +u32 crc32_optimizations(void) +{ + u32 optimizations = 0; + + if (static_key_enabled(&have_crc32)) + optimizations |= CRC32C_OPTIMIZATION; + if (static_key_enabled(&have_pclmulqdq)) + optimizations |= CRC32_LE_OPTIMIZATION; + return optimizations; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_DESCRIPTION("x86-optimized CRC32 functions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S new file mode 100644 index 000000000000..f9637789cac1 --- /dev/null +++ b/arch/x86/lib/crc32-pclmul.S @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. 
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> + + +.section .rodata +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define CRC %edi +#define BUF %rsi +#define LEN %rdx +#else +#define CRC %eax +#define BUF %edx +#define LEN %ecx +#endif + + + +.text +/** + * Calculate crc32 + * CRC - initial crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63 + * return %eax crc32 + * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); + */ + +SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jb .Lless_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1, CONSTANT +#endif + +.Lloop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm2 + pclmulqdq $0x00, CONSTANT, %xmm3 +#ifdef __x86_64__ + pclmulqdq $0x00, CONSTANT, %xmm4 +#endif + pclmulqdq $0x11, CONSTANT, %xmm5 + pclmulqdq $0x11, CONSTANT, %xmm6 + pclmulqdq $0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + pclmulqdq $0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm4 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge .Lloop_64 +.Lless_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3, CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 
+ + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb .Lfold_64 +.Lloop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge .Lloop_16 + +.Lfold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5, CONSTANT + movdqa .Lconstant_mask32, %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly, CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + pclmulqdq $0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + pextrd $0x01, %xmm1, %eax + + RET +SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S new file mode 100644 index 000000000000..9b8770503bbc --- /dev/null +++ b/arch/x86/lib/crc32c-3way.S @@ -0,0 +1,360 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * James Guilford <james.guilford@intel.com> + * David Cote <david.m.cote@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/linkage.h> + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +# Define threshold below which buffers are considered "small" and routed to +# regular CRC code that does not interleave the CRC instructions. +#define SMALL_SIZE 200 + +# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +.text +SYM_FUNC_START(crc32c_x86_3way) +#define crc0 %edi +#define crc0_q %rdi +#define bufp %rsi +#define bufp_d %esi +#define len %rdx +#define len_dw %edx +#define n_misaligned %ecx /* overlaps chunk_bytes! */ +#define n_misaligned_q %rcx +#define chunk_bytes %ecx /* overlaps n_misaligned! */ +#define chunk_bytes_q %rcx +#define crc1 %r8 +#define crc2 %r9 + + cmp $SMALL_SIZE, len + jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ + mov bufp_d, n_misaligned + neg n_misaligned + and $7, n_misaligned # calculate the misalignment amount of + # the address + je .Laligned # Skip if aligned + + # Process 1 <= n_misaligned <= 7 bytes individually in order to align + # the remaining data to an 8-byte boundary. +.Ldo_align: + movq (bufp), %rax + add n_misaligned_q, bufp + sub n_misaligned_q, len +.Lalign_loop: + crc32b %al, crc0 # compute crc32 of 1-byte + shr $8, %rax # get next byte + dec n_misaligned + jne .Lalign_loop +.Laligned: + + ################################################################ + ## 2) PROCESS BLOCK: + ################################################################ + + cmp $128*24, len + jae .Lfull_block + +.Lpartial_block: + # Compute floor(len / 24) to get num qwords to process from each lane. + imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) + shr $16, %eax + jmp .Lcrc_3lanes + +.Lfull_block: + # Processing 128 qwords from each lane. + mov $128, %eax + + ################################################################ + ## 3) CRC each of three lanes: + ################################################################ + +.Lcrc_3lanes: + xor crc1,crc1 + xor crc2,crc2 + mov %eax, chunk_bytes + shl $3, chunk_bytes # num bytes to process from each lane + sub $5, %eax # 4 for 4x_loop, 1 for special last iter + jl .Lcrc_3lanes_4x_done + + # Unroll the loop by a factor of 4 to reduce the overhead of the loop + # bookkeeping instructions, which can compete with crc32q for the ALUs. 
+.Lcrc_3lanes_4x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + crc32q 8(bufp), crc0_q + crc32q 8(bufp,chunk_bytes_q), crc1 + crc32q 8(bufp,chunk_bytes_q,2), crc2 + crc32q 16(bufp), crc0_q + crc32q 16(bufp,chunk_bytes_q), crc1 + crc32q 16(bufp,chunk_bytes_q,2), crc2 + crc32q 24(bufp), crc0_q + crc32q 24(bufp,chunk_bytes_q), crc1 + crc32q 24(bufp,chunk_bytes_q,2), crc2 + add $32, bufp + sub $4, %eax + jge .Lcrc_3lanes_4x_loop + +.Lcrc_3lanes_4x_done: + add $4, %eax + jz .Lcrc_3lanes_last_qword + +.Lcrc_3lanes_1x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + add $8, bufp + dec %eax + jnz .Lcrc_3lanes_1x_loop + +.Lcrc_3lanes_last_qword: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 +# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-8)(%rip), %rax # first entry is for idx 1 + pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 + lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 + sub %rax, len # len -= chunk_bytes * 3 + + movq crc0_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc0_q + crc32 %rax, crc0_q + lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ + ## 5) If more blocks remain, goto (2): + ################################################################ + + cmp $128*24, len + jae .Lfull_block + cmp $SMALL_SIZE, len + jae .Lpartial_block + + ####################################################################### + ## 6) Process any remainder without interleaving: + ####################################################################### +.Lsmall: + test len_dw, len_dw + jz .Ldone + mov len_dw, %eax + shr $3, %eax + jz .Ldo_dword +.Ldo_qwords: + crc32q (bufp), crc0_q + add $8, bufp + dec %eax + jnz .Ldo_qwords +.Ldo_dword: + test $4, len_dw + jz .Ldo_word + crc32l (bufp), crc0 + add $4, bufp +.Ldo_word: + test $2, len_dw + jz .Ldo_byte + crc32w (bufp), crc0 + add $2, bufp +.Ldo_byte: + test $1, len_dw + jz .Ldone + crc32b (bufp), crc0 +.Ldone: + mov crc0, %eax + RET +SYM_FUNC_END(crc32c_x86_3way) + +.section .rodata, "a", @progbits + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each + ################################################################ +.align 8 +K_table: + .long 0x493c7d27, 0x00000001 + .long 0xba4fc28e, 0x493c7d27 + .long 0xddc0152b, 0xf20c0dfe + .long 0x9e4addf8, 0xba4fc28e + .long 0x39d3b296, 0x3da6d0cb + .long 0x0715ce53, 0xddc0152b + .long 0x47db8317, 0x1c291d04 + .long 0x0d3b6092, 0x9e4addf8 + .long 0xc96cfdc0, 0x740eef02 + .long 0x878a92a7, 0x39d3b296 + .long 0xdaece73e, 0x083a6eec + .long 0xab7aff2a, 0x0715ce53 + .long 0x2162d385, 0xc49f4f67 + .long 0x83348832, 0x47db8317 + .long 0x299847d5, 0x2ad91c30 + .long 0xb9e02b86, 0x0d3b6092 + .long 0x18b33a4e, 0x6992cea2 + .long 0xb6dd949b, 0xc96cfdc0 + .long 0x78d9ccb7, 0x7e908048 + .long 0xbac2fd7b, 0x878a92a7 + .long 0xa60ce07b, 0x1b3d8f29 + .long 0xce7f39f4, 0xdaece73e + .long 0x61d82e56, 0xf1d0f55e + .long 0xd270f1a2, 0xab7aff2a + .long 
0xc619809d, 0xa87ab8a8 + .long 0x2b3cac5d, 0x2162d385 + .long 0x65863b64, 0x8462d800 + .long 0x1b03397f, 0x83348832 + .long 0xebb883bd, 0x71d111a8 + .long 0xb3e32c28, 0x299847d5 + .long 0x064f7f26, 0xffd852c6 + .long 0xdd7e3b0c, 0xb9e02b86 + .long 0xf285651c, 0xdcb17aa4 + .long 0x10746f3c, 0x18b33a4e + .long 0xc7a68855, 0xf37c5aee + .long 0x271d9844, 0xb6dd949b + .long 0x8e766a0c, 0x6051d5a2 + .long 0x93a5f730, 0x78d9ccb7 + .long 0x6cb08e5c, 0x18b0d4ff + .long 0x6b749fb2, 0xbac2fd7b + .long 0x1393e203, 0x21f3d99c + .long 0xcec3662e, 0xa60ce07b + .long 0x96c515bb, 0x8f158014 + .long 0xe6fc4e6a, 0xce7f39f4 + .long 0x8227bb8a, 0xa00457f7 + .long 0xb0cd4768, 0x61d82e56 + .long 0x39c7ff35, 0x8d6d2c43 + .long 0xd7a4825c, 0xd270f1a2 + .long 0x0ab3844b, 0x00ac29cf + .long 0x0167d312, 0xc619809d + .long 0xf6076544, 0xe9adf796 + .long 0x26f6a60a, 0x2b3cac5d + .long 0xa741c1bf, 0x96638b34 + .long 0x98d8d9cb, 0x65863b64 + .long 0x49c3cc9c, 0xe0e9f351 + .long 0x68bce87a, 0x1b03397f + .long 0x57a3d037, 0x9af01f2d + .long 0x6956fc3b, 0xebb883bd + .long 0x42d98888, 0x2cff42cf + .long 0x3771e98f, 0xb3e32c28 + .long 0xb42ae3d9, 0x88f25a3a + .long 0x2178513a, 0x064f7f26 + .long 0xe0ac139e, 0x4e36f0b0 + .long 0x170076fa, 0xdd7e3b0c + .long 0x444dd413, 0xbd6f81f8 + .long 0x6f345e45, 0xf285651c + .long 0x41d17b64, 0x91c9bd4b + .long 0xff0dba97, 0x10746f3c + .long 0xa2b73df1, 0x885f087b + .long 0xf872e54c, 0xc7a68855 + .long 0x1e41e9fc, 0x4c144932 + .long 0x86d8e4d2, 0x271d9844 + .long 0x651bd98b, 0x52148f02 + .long 0x5bb8f1bc, 0x8e766a0c + .long 0xa90fd27a, 0xa3c6f37a + .long 0xb3af077a, 0x93a5f730 + .long 0x4984d782, 0xd7c0557f + .long 0xca6ef3ac, 0x6cb08e5c + .long 0x234e0b26, 0x63ded06a + .long 0xdd66cbbb, 0x6b749fb2 + .long 0x4597456a, 0x4d56973c + .long 0xe9e28eb4, 0x1393e203 + .long 0x7b3ff57a, 0x9669c9df + .long 0xc9c8b782, 0xcec3662e + .long 0x3f70cc6f, 0xe417f38a + .long 0x93e106a4, 0x96c515bb + .long 0x62ec6c6d, 0x4b9e0f71 + .long 0xd813b325, 0xe6fc4e6a + .long 0x0df04680, 0xd104b8fc + .long 0x2342001e, 0x8227bb8a + .long 0x0a2a8d7e, 0x5b397730 + .long 0x6d9a4957, 0xb0cd4768 + .long 0xe8b6368b, 0xe78eb416 + .long 0xd2c3ed1a, 0x39c7ff35 + .long 0x995a5724, 0x61ff0e01 + .long 0x9ef68d35, 0xd7a4825c + .long 0x0c139b31, 0x8d96551c + .long 0xf2271e60, 0x0ab3844b + .long 0x0b0bf8ca, 0x0bf80dd2 + .long 0x2664fd8b, 0x0167d312 + .long 0xed64812d, 0x8821abed + .long 0x02ee03b2, 0xf6076544 + .long 0x8604ae0f, 0x6a45d2b2 + .long 0x363bd6b3, 0x26f6a60a + .long 0x135c83fd, 0xd8d26619 + .long 0x5fabe670, 0xa741c1bf + .long 0x35ec3279, 0xde87806c + .long 0x00bcf5f6, 0x98d8d9cb + .long 0x8ae00689, 0x14338754 + .long 0x17f27698, 0x49c3cc9c + .long 0x58ca5f00, 0x5bd2011f + .long 0xaa7c7ad5, 0x68bce87a + .long 0xb5cfca28, 0xdd07448e + .long 0xded288f8, 0x57a3d037 + .long 0x59f229bc, 0xdde8f5b9 + .long 0x6d390dec, 0x6956fc3b + .long 0x37170390, 0xa3e3e02c + .long 0x6353c1cc, 0x42d98888 + .long 0xc4584f5c, 0xd73c7bea + .long 0xf48642e9, 0x3771e98f + .long 0x531377e2, 0x80ff0093 + .long 0xdd35bc8d, 0xb42ae3d9 + .long 0xb25b29f2, 0x8fe4c34d + .long 0x9a5ede41, 0x2178513a + .long 0xa563905d, 0xdf99fc11 + .long 0x45cddf4e, 0xe0ac139e + .long 0xacfa3103, 0x6c23e841 + .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..5286db5b8165 --- /dev/null +++ b/arch/x86/lib/crct10dif-pcl-asm_64.S @@ -0,0 +1,332 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE 
and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# + +#include <linux/linkage.h> + +.text + +#define init_crc %edi +#define buf %rsi +#define len %rdx + +#define FOLD_CONSTS %xmm10 +#define BSWAP_MASK %xmm11 + +# Fold reg1, reg2 into the next 32 data bytes, storing the result back into +# reg1, reg2. +.macro fold_32_bytes offset, reg1, reg2 + movdqu \offset(buf), %xmm9 + movdqu \offset+16(buf), %xmm12 + pshufb BSWAP_MASK, %xmm9 + pshufb BSWAP_MASK, %xmm12 + movdqa \reg1, %xmm8 + movdqa \reg2, %xmm13 + pclmulqdq $0x00, FOLD_CONSTS, \reg1 + pclmulqdq $0x11, FOLD_CONSTS, %xmm8 + pclmulqdq $0x00, FOLD_CONSTS, \reg2 + pclmulqdq $0x11, FOLD_CONSTS, %xmm13 + pxor %xmm9 , \reg1 + xorps %xmm8 , \reg1 + pxor %xmm12, \reg2 + xorps %xmm13, \reg2 +.endm + +# Fold src_reg into dst_reg. +.macro fold_16_bytes src_reg, dst_reg + movdqa \src_reg, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, \src_reg + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, \dst_reg + xorps \src_reg, \dst_reg +.endm + +# +# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); +# +# Assumes len >= 16. +# +SYM_FUNC_START(crc_t10dif_pcl) + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + + # For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp $256, len + jl .Lless_than_256_bytes + + # Load the first 128 data bytes. 
Byte swapping is necessary to make the + # bit order match the polynomial coefficient order. + movdqu 16*0(buf), %xmm0 + movdqu 16*1(buf), %xmm1 + movdqu 16*2(buf), %xmm2 + movdqu 16*3(buf), %xmm3 + movdqu 16*4(buf), %xmm4 + movdqu 16*5(buf), %xmm5 + movdqu 16*6(buf), %xmm6 + movdqu 16*7(buf), %xmm7 + add $128, buf + pshufb BSWAP_MASK, %xmm0 + pshufb BSWAP_MASK, %xmm1 + pshufb BSWAP_MASK, %xmm2 + pshufb BSWAP_MASK, %xmm3 + pshufb BSWAP_MASK, %xmm4 + pshufb BSWAP_MASK, %xmm5 + pshufb BSWAP_MASK, %xmm6 + pshufb BSWAP_MASK, %xmm7 + + # XOR the first 16 data *bits* with the initial CRC value. + pxor %xmm8, %xmm8 + pinsrw $7, init_crc, %xmm8 + pxor %xmm8, %xmm0 + + movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS + + # Subtract 128 for the 128 data bytes just consumed. Subtract another + # 128 to simplify the termination condition of the following loop. + sub $256, len + + # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 + # bytes xmm0-7 into them, storing the result back into xmm0-7. +.Lfold_128_bytes_loop: + fold_32_bytes 0, %xmm0, %xmm1 + fold_32_bytes 32, %xmm2, %xmm3 + fold_32_bytes 64, %xmm4, %xmm5 + fold_32_bytes 96, %xmm6, %xmm7 + add $128, buf + sub $128, len + jge .Lfold_128_bytes_loop + + # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. + + # Fold across 64 bytes. + movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm0, %xmm4 + fold_16_bytes %xmm1, %xmm5 + fold_16_bytes %xmm2, %xmm6 + fold_16_bytes %xmm3, %xmm7 + # Fold across 32 bytes. + movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm4, %xmm6 + fold_16_bytes %xmm5, %xmm7 + # Fold across 16 bytes. + movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm6, %xmm7 + + # Add 128 to get the correct number of data bytes remaining in 0...127 + # (not counting xmm7), following the previous extra subtraction by 128. + # Then subtract 16 to simplify the termination condition of the + # following loop. + add $128-16, len + + # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes + # xmm7 into them, storing the result back into xmm7. + jl .Lfold_16_bytes_loop_done +.Lfold_16_bytes_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, %xmm7 + movdqu (buf), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0 , %xmm7 + add $16, buf + sub $16, len + jge .Lfold_16_bytes_loop + +.Lfold_16_bytes_loop_done: + # Add 16 to get the correct number of data bytes remaining in 0...15 + # (not counting xmm7), following the previous extra subtraction by 16. + add $16, len + je .Lreduce_final_16_bytes + +.Lhandle_partial_segment: + # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 + # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do + # this without needing a fold constant for each possible 'len', redivide + # the bytes into a first chunk of 'len' bytes and a second chunk of 16 + # bytes, then fold the first chunk into the second. + + movdqa %xmm7, %xmm2 + + # xmm1 = last 16 original data bytes + movdqu -16(buf, len), %xmm1 + pshufb BSWAP_MASK, %xmm1 + + # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. + lea .Lbyteshift_table+16(%rip), %rax + sub len, %rax + movdqu (%rax), %xmm0 + pshufb %xmm0, %xmm2 + + # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. 
+ pxor .Lmask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + + # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), + # then '16-len' bytes from xmm2 (high-order bytes). + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # Fold the first chunk into the second chunk, storing the result in xmm7. + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm1, %xmm7 + +.Lreduce_final_16_bytes: + # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC + + # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS + + # Fold the high 64 bits into the low 64 bits, while also multiplying by + # x^64. This produces a 128-bit value congruent to x^64 * M(x) and + # whose low 48 bits are 0. + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) + pslldq $8, %xmm0 + pxor %xmm0, %xmm7 # + low bits * x^64 + + # Fold the high 32 bits into the low 96 bits. This produces a 96-bit + # value congruent to x^64 * M(x) and whose low 48 bits are 0. + movdqa %xmm7, %xmm0 + pand .Lmask2(%rip), %xmm0 # zero high 32 bits + psrldq $12, %xmm7 # extract high 32 bits + pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) + pxor %xmm0, %xmm7 # + low bits + + # Load G(x) and floor(x^48 / G(x)). + movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS + + # Use Barrett reduction to compute the final CRC value. + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) + psrlq $32, %xmm7 # /= x^32 + pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) + psrlq $48, %xmm0 + pxor %xmm7, %xmm0 # + low 16 nonzero bits + # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. + + pextrw $0, %xmm0, %eax + RET + +.align 16 +.Lless_than_256_bytes: + # Checksumming a buffer of length 16...255 bytes + + # Load the first 16 data bytes. + movdqu (buf), %xmm7 + pshufb BSWAP_MASK, %xmm7 + add $16, buf + + # XOR the first 16 data *bits* with the initial CRC value. 
+ pxor %xmm0, %xmm0 + pinsrw $7, init_crc, %xmm0 + pxor %xmm0, %xmm7 + + movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS + cmp $16, len + je .Lreduce_final_16_bytes # len == 16 + sub $32, len + jge .Lfold_16_bytes_loop # 32 <= len <= 255 + add $16, len + jmp .Lhandle_partial_segment # 17 <= len <= 31 +SYM_FUNC_END(crc_t10dif_pcl) + +.section .rodata, "a", @progbits +.align 16 + +# Fold constants precomputed from the polynomial 0x18bb7 +# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 # x^(8*128) mod G(x) + .quad 0x0000000000002295 # x^(8*128+64) mod G(x) +.Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 # x^(4*128) mod G(x) + .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) +.Lfold_across_32_bytes_consts: + .quad 0x000000000000857d # x^(2*128) mod G(x) + .quad 0x0000000000007acc # x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 # x^(1*128) mod G(x) + .quad 0x0000000000001faa # x^(1*128+64) mod G(x) +.Lfinal_fold_consts: + .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) +.Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 # G(x) + .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) + +.section .rodata.cst16.mask1, "aM", @progbits, 16 +.align 16 +.Lmask1: + .octa 0x80808080808080808080808080808080 + +.section .rodata.cst16.mask2, "aM", @progbits, 16 +.align 16 +.Lmask2: + .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090A0B0C0D0E0F + +.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 +.align 16 +# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] +# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., +# 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 4357ec2a0bfc..89ecd57c9d42 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -44,9 +44,8 @@ .pushsection runtime_ptr_USER_PTR_MAX,"a" .long 1b - 8 - . .popsection - cmp %rax, %rdx - sbb %rdx, %rdx - or %rdx, %rax + cmp %rdx, %rax + cmova %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax jae .Lbad_get_user |
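The getuser.S hunk above replaces the cmp/sbb/or sequence, which turned an out-of-range user pointer into an all-ones address, with cmp/cmova, which clamps it to USER_PTR_MAX. A rough C sketch of the two behaviours (illustrative only; the function and variable names are not from the kernel):

#include <stdint.h>

/* old sequence: mask an out-of-range pointer to all ones */
static uint64_t mask_bad_ptr(uint64_t addr, uint64_t user_ptr_max)
{
        uint64_t mask = (addr > user_ptr_max) ? ~0ULL : 0;   /* sbb %rdx,%rdx */

        return addr | mask;                                  /* or  %rdx,%rax */
}

/* new sequence: clamp the pointer to USER_PTR_MAX instead */
static uint64_t clamp_bad_ptr(uint64_t addr, uint64_t user_ptr_max)
{
        return (addr > user_ptr_max) ? user_ptr_max : addr;  /* cmova %rdx,%rax */
}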