summaryrefslogtreecommitdiff
path: root/arch/x86/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/Makefile7
-rw-r--r--arch/x86/lib/copy_user_64.S18
-rw-r--r--arch/x86/lib/crc-t10dif-glue.c51
-rw-r--r--arch/x86/lib/crc32-glue.c124
-rw-r--r--arch/x86/lib/crc32-pclmul.S217
-rw-r--r--arch/x86/lib/crc32c-3way.S360
-rw-r--r--arch/x86/lib/crct10dif-pcl-asm_64.S332
-rw-r--r--arch/x86/lib/getuser.S5
8 files changed, 1111 insertions, 3 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 98583a9dbab3..8a59c61624c2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -38,6 +38,13 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
+obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
+crc32-x86-y := crc32-glue.o crc32-pclmul.o
+crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
+
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
+crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o
+
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..b8f74d80f35c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -74,6 +74,24 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* Do the first possibly unaligned word */
+0: movq (%rsi),%rax
+1: movq %rax,(%rdi)
+
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+
+ /* What would be the offset to the aligned destination? */
+ leaq 8(%rdi),%rax
+ andq $-8,%rax
+ subq %rdi,%rax
+
+ /* .. and update pointers and count to match */
+ addq %rax,%rdi
+ addq %rax,%rsi
+ subq %rax,%rcx
+
+ /* make %rcx contain the number of words, %rax the remainder */
movq %rcx,%rax
shrq $3,%rcx
andl $7,%eax
diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c
new file mode 100644
index 000000000000..13f07ddc9122
--- /dev/null
+++ b/arch/x86/lib/crc-t10dif-glue.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC-T10DIF using PCLMULQDQ instructions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/crc-t10dif.h>
+#include <linux/module.h>
+
+static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
+
+u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ if (len >= 16 &&
+ static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ crc = crc_t10dif_pcl(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+ return crc_t10dif_generic(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_x86_init(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+ static_branch_enable(&have_pclmulqdq);
+ return 0;
+}
+arch_initcall(crc_t10dif_x86_init);
+
+static void __exit crc_t10dif_x86_exit(void)
+{
+}
+module_exit(crc_t10dif_x86_exit);
+
+bool crc_t10dif_is_optimized(void)
+{
+ return static_key_enabled(&have_pclmulqdq);
+}
+EXPORT_SYMBOL(crc_t10dif_is_optimized);
+
+MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
new file mode 100644
index 000000000000..2dd18a886ded
--- /dev/null
+++ b/arch/x86/lib/crc32-glue.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/crc32.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+/* minimum size of buffer for crc32_pclmul_le_16 */
+#define CRC32_PCLMUL_MIN_LEN 64
+
+static DEFINE_STATIC_KEY_FALSE(have_crc32);
+static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
+
+u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= CRC32_PCLMUL_MIN_LEN + 15 &&
+ static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ size_t n = -(uintptr_t)p & 15;
+
+ /* align p to 16-byte boundary */
+ if (n) {
+ crc = crc32_le_base(crc, p, n);
+ p += n;
+ len -= n;
+ }
+ n = round_down(len, 16);
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(crc, p, n);
+ kernel_fpu_end();
+ p += n;
+ len -= n;
+ }
+ if (len)
+ crc = crc32_le_base(crc, p, len);
+ return crc;
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN 512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ size_t num_longs;
+
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_le_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+ static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ crc = crc32c_x86_3way(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+
+ for (num_longs = len / sizeof(unsigned long);
+ num_longs != 0; num_longs--, p += sizeof(unsigned long))
+ asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p));
+
+ for (len %= sizeof(unsigned long); len; len--, p++)
+ asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p));
+
+ return crc;
+}
+EXPORT_SYMBOL(crc32c_le_arch);
+
+u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ return crc32_be_base(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+static int __init crc32_x86_init(void)
+{
+ if (boot_cpu_has(X86_FEATURE_XMM4_2))
+ static_branch_enable(&have_crc32);
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+ static_branch_enable(&have_pclmulqdq);
+ return 0;
+}
+arch_initcall(crc32_x86_init);
+
+static void __exit crc32_x86_exit(void)
+{
+}
+module_exit(crc32_x86_exit);
+
+u32 crc32_optimizations(void)
+{
+ u32 optimizations = 0;
+
+ if (static_key_enabled(&have_crc32))
+ optimizations |= CRC32C_OPTIMIZATION;
+ if (static_key_enabled(&have_pclmulqdq))
+ optimizations |= CRC32_LE_OPTIMIZATION;
+ return optimizations;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_DESCRIPTION("x86-optimized CRC32 functions");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
new file mode 100644
index 000000000000..f9637789cac1
--- /dev/null
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+
+
+.section .rodata
+.align 16
+/*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+ .octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+ .octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+.Lconstant_R5:
+ .octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+ .octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+ .octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define CRC %edi
+#define BUF %rsi
+#define LEN %rdx
+#else
+#define CRC %eax
+#define BUF %edx
+#define LEN %ecx
+#endif
+
+
+
+.text
+/**
+ * Calculate crc32
+ * CRC - initial crc32
+ * BUF - buffer (16 bytes aligned)
+ * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
+ * return %eax crc32
+ * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
+ */
+
+SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+ movdqa (BUF), %xmm1
+ movdqa 0x10(BUF), %xmm2
+ movdqa 0x20(BUF), %xmm3
+ movdqa 0x30(BUF), %xmm4
+ movd CRC, CONSTANT
+ pxor CONSTANT, %xmm1
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jb .Lless_64
+
+#ifdef __x86_64__
+ movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R2R1, CONSTANT
+#endif
+
+.Lloop_64:/* 64 bytes Full cache line folding */
+ prefetchnta 0x40(BUF)
+ movdqa %xmm1, %xmm5
+ movdqa %xmm2, %xmm6
+ movdqa %xmm3, %xmm7
+#ifdef __x86_64__
+ movdqa %xmm4, %xmm8
+#endif
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm2
+ pclmulqdq $0x00, CONSTANT, %xmm3
+#ifdef __x86_64__
+ pclmulqdq $0x00, CONSTANT, %xmm4
+#endif
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pclmulqdq $0x11, CONSTANT, %xmm6
+ pclmulqdq $0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+ pclmulqdq $0x11, CONSTANT, %xmm8
+#endif
+ pxor %xmm5, %xmm1
+ pxor %xmm6, %xmm2
+ pxor %xmm7, %xmm3
+#ifdef __x86_64__
+ pxor %xmm8, %xmm4
+#else
+ /* xmm8 unsupported for x32 */
+ movdqa %xmm4, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm4
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm4
+#endif
+
+ pxor (BUF), %xmm1
+ pxor 0x10(BUF), %xmm2
+ pxor 0x20(BUF), %xmm3
+ pxor 0x30(BUF), %xmm4
+
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jge .Lloop_64
+.Lless_64:/* Folding cache line into 128bit */
+#ifdef __x86_64__
+ movdqa .Lconstant_R4R3(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R4R3, CONSTANT
+#endif
+ prefetchnta (BUF)
+
+ movdqa %xmm1, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm2, %xmm1
+
+ movdqa %xmm1, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm3, %xmm1
+
+ movdqa %xmm1, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm4, %xmm1
+
+ cmp $0x10, LEN
+ jb .Lfold_64
+.Lloop_16:/* Folding rest buffer into 128bit */
+ movdqa %xmm1, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor (BUF), %xmm1
+ sub $0x10, LEN
+ add $0x10, BUF
+ cmp $0x10, LEN
+ jge .Lloop_16
+
+.Lfold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ psrldq $0x08, %xmm1
+ pxor CONSTANT, %xmm1
+
+ /* final 32-bit fold */
+ movdqa %xmm1, %xmm2
+#ifdef __x86_64__
+ movdqa .Lconstant_R5(%rip), CONSTANT
+ movdqa .Lconstant_mask32(%rip), %xmm3
+#else
+ movdqa .Lconstant_R5, CONSTANT
+ movdqa .Lconstant_mask32, %xmm3
+#endif
+ psrldq $0x04, %xmm2
+ pand %xmm3, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+ movdqa .Lconstant_RUpoly(%rip), CONSTANT
+#else
+ movdqa .Lconstant_RUpoly, CONSTANT
+#endif
+ movdqa %xmm1, %xmm2
+ pand %xmm3, %xmm1
+ pclmulqdq $0x10, CONSTANT, %xmm1
+ pand %xmm3, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+ pextrd $0x01, %xmm1, %eax
+
+ RET
+SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/arch/x86/lib/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
+
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
+ # the address
+ je .Laligned # Skip if aligned
+
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
+.Ldo_align:
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned_q, len
+.Lalign_loop:
+ crc32b %al, crc0 # compute crc32 of 1-byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
+ jne .Lalign_loop
+.Laligned:
+
+ ################################################################
+ ## 2) PROCESS BLOCK:
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
+ jmp .Lcrc_3lanes
+
+.Lfull_block:
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
+
+ ################################################################
+ ## 3) CRC each of three lanes:
+ ################################################################
+
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc0_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc0_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc0_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
+
+ movq crc0_q, %xmm1 # CRC for block 1
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
+
+ ################################################################
+ ## 5) If more blocks remain, goto (2):
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
+
+ #######################################################################
+ ## 6) Process any remainder without interleaving:
+ #######################################################################
+.Lsmall:
+ test len_dw, len_dw
+ jz .Ldone
+ mov len_dw, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc0_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len_dw
+ jz .Ldo_word
+ crc32l (bufp), crc0
+ add $4, bufp
+.Ldo_word:
+ test $2, len_dw
+ jz .Ldo_byte
+ crc32w (bufp), crc0
+ add $2, bufp
+.Ldo_byte:
+ test $1, len_dw
+ jz .Ldone
+ crc32b (bufp), crc0
+.Ldone:
+ mov crc0, %eax
+ RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## PCLMULQDQ tables
+ ## Table is 128 entries x 2 words (8 bytes) each
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..5286db5b8165
--- /dev/null
+++ b/arch/x86/lib/crct10dif-pcl-asm_64.S
@@ -0,0 +1,332 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+# Erdinc Ozturk <erdinc.ozturk@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# James Guilford <james.guilford@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Reference paper titled "Fast CRC Computation for Generic
+# Polynomials Using PCLMULQDQ Instruction"
+# URL: http://www.intel.com/content/dam/www/public/us/en/documents
+# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define init_crc %edi
+#define buf %rsi
+#define len %rdx
+
+#define FOLD_CONSTS %xmm10
+#define BSWAP_MASK %xmm11
+
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
+# reg1, reg2.
+.macro fold_32_bytes offset, reg1, reg2
+ movdqu \offset(buf), %xmm9
+ movdqu \offset+16(buf), %xmm12
+ pshufb BSWAP_MASK, %xmm9
+ pshufb BSWAP_MASK, %xmm12
+ movdqa \reg1, %xmm8
+ movdqa \reg2, %xmm13
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
+ pxor %xmm9 , \reg1
+ xorps %xmm8 , \reg1
+ pxor %xmm12, \reg2
+ xorps %xmm13, \reg2
+.endm
+
+# Fold src_reg into dst_reg.
+.macro fold_16_bytes src_reg, dst_reg
+ movdqa \src_reg, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, \dst_reg
+ xorps \src_reg, \dst_reg
+.endm
+
+#
+# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
+#
+# Assumes len >= 16.
+#
+SYM_FUNC_START(crc_t10dif_pcl)
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp $256, len
+ jl .Lless_than_256_bytes
+
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
+ # bit order match the polynomial coefficient order.
+ movdqu 16*0(buf), %xmm0
+ movdqu 16*1(buf), %xmm1
+ movdqu 16*2(buf), %xmm2
+ movdqu 16*3(buf), %xmm3
+ movdqu 16*4(buf), %xmm4
+ movdqu 16*5(buf), %xmm5
+ movdqu 16*6(buf), %xmm6
+ movdqu 16*7(buf), %xmm7
+ add $128, buf
+ pshufb BSWAP_MASK, %xmm0
+ pshufb BSWAP_MASK, %xmm1
+ pshufb BSWAP_MASK, %xmm2
+ pshufb BSWAP_MASK, %xmm3
+ pshufb BSWAP_MASK, %xmm4
+ pshufb BSWAP_MASK, %xmm5
+ pshufb BSWAP_MASK, %xmm6
+ pshufb BSWAP_MASK, %xmm7
+
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm8, %xmm8
+ pinsrw $7, init_crc, %xmm8
+ pxor %xmm8, %xmm0
+
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
+
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
+ # 128 to simplify the termination condition of the following loop.
+ sub $256, len
+
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes 0, %xmm0, %xmm1
+ fold_32_bytes 32, %xmm2, %xmm3
+ fold_32_bytes 64, %xmm4, %xmm5
+ fold_32_bytes 96, %xmm6, %xmm7
+ add $128, buf
+ sub $128, len
+ jge .Lfold_128_bytes_loop
+
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
+
+ # Fold across 64 bytes.
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm0, %xmm4
+ fold_16_bytes %xmm1, %xmm5
+ fold_16_bytes %xmm2, %xmm6
+ fold_16_bytes %xmm3, %xmm7
+ # Fold across 32 bytes.
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm4, %xmm6
+ fold_16_bytes %xmm5, %xmm7
+ # Fold across 16 bytes.
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm6, %xmm7
+
+ # Add 128 to get the correct number of data bytes remaining in 0...127
+ # (not counting xmm7), following the previous extra subtraction by 128.
+ # Then subtract 16 to simplify the termination condition of the
+ # following loop.
+ add $128-16, len
+
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
+ # xmm7 into them, storing the result back into xmm7.
+ jl .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, %xmm7
+ movdqu (buf), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0 , %xmm7
+ add $16, buf
+ sub $16, len
+ jge .Lfold_16_bytes_loop
+
+.Lfold_16_bytes_loop_done:
+ # Add 16 to get the correct number of data bytes remaining in 0...15
+ # (not counting xmm7), following the previous extra subtraction by 16.
+ add $16, len
+ je .Lreduce_final_16_bytes
+
+.Lhandle_partial_segment:
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
+ # this without needing a fold constant for each possible 'len', redivide
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
+ # bytes, then fold the first chunk into the second.
+
+ movdqa %xmm7, %xmm2
+
+ # xmm1 = last 16 original data bytes
+ movdqu -16(buf, len), %xmm1
+ pshufb BSWAP_MASK, %xmm1
+
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
+ lea .Lbyteshift_table+16(%rip), %rax
+ sub len, %rax
+ movdqu (%rax), %xmm0
+ pshufb %xmm0, %xmm2
+
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
+ pxor .Lmask1(%rip), %xmm0
+ pshufb %xmm0, %xmm7
+
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
+ # then '16-len' bytes from xmm2 (high-order bytes).
+ pblendvb %xmm2, %xmm1 #xmm0 is implicit
+
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm1, %xmm7
+
+.Lreduce_final_16_bytes:
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
+
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
+
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ # whose low 48 bits are 0.
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
+ pslldq $8, %xmm0
+ pxor %xmm0, %xmm7 # + low bits * x^64
+
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ movdqa %xmm7, %xmm0
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
+ psrldq $12, %xmm7 # extract high 32 bits
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
+ pxor %xmm0, %xmm7 # + low bits
+
+ # Load G(x) and floor(x^48 / G(x)).
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
+
+ # Use Barrett reduction to compute the final CRC value.
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
+ psrlq $32, %xmm7 # /= x^32
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
+ psrlq $48, %xmm0
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
+
+ pextrw $0, %xmm0, %eax
+ RET
+
+.align 16
+.Lless_than_256_bytes:
+ # Checksumming a buffer of length 16...255 bytes
+
+ # Load the first 16 data bytes.
+ movdqu (buf), %xmm7
+ pshufb BSWAP_MASK, %xmm7
+ add $16, buf
+
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm0, %xmm0
+ pinsrw $7, init_crc, %xmm0
+ pxor %xmm0, %xmm7
+
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ cmp $16, len
+ je .Lreduce_final_16_bytes # len == 16
+ sub $32, len
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
+ add $16, len
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
+SYM_FUNC_END(crc_t10dif_pcl)
+
+.section .rodata, "a", @progbits
+.align 16
+
+# Fold constants precomputed from the polynomial 0x18bb7
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
+.Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
+.Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
+.Lfinal_fold_consts:
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
+.Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 # G(x)
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
+
+.section .rodata.cst16.mask1, "aM", @progbits, 16
+.align 16
+.Lmask1:
+ .octa 0x80808080808080808080808080808080
+
+.section .rodata.cst16.mask2, "aM", @progbits, 16
+.align 16
+.Lmask2:
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
+.align 16
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 4357ec2a0bfc..89ecd57c9d42 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -44,9 +44,8 @@
.pushsection runtime_ptr_USER_PTR_MAX,"a"
.long 1b - 8 - .
.popsection
- cmp %rax, %rdx
- sbb %rdx, %rdx
- or %rdx, %rax
+ cmp %rdx, %rax
+ cmova %rdx, %rax
.else
cmp $TASK_SIZE_MAX-\size+1, %eax
jae .Lbad_get_user