summaryrefslogtreecommitdiff
path: root/lib/crc/x86/crc32.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/crc/x86/crc32.h')
-rw-r--r--lib/crc/x86/crc32.h137
1 files changed, 137 insertions, 0 deletions
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
new file mode 100644
index 000000000000..cea2c96d08d0
--- /dev/null
+++ b/lib/crc/x86/crc32.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+ have_pclmulqdq);
+ return crc32_le_base(crc, p, len);
+}
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN 512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ size_t num_longs;
+
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+ static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ /*
+ * Long length, the vector registers are usable, and the CPU is
+ * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+ * It is worthwhile to divide the data into multiple streams,
+ * CRC them independently, and combine them using PCLMULQDQ.
+ * crc32c_x86_3way() does this using 3 streams, which is the
+ * most that x86_64 CPUs have traditionally been capable of.
+ *
+ * However, due to improved VPCLMULQDQ performance on newer
+ * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+ * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+ * "good" implementation of AVX-512.
+ *
+ * Future work: the optimal strategy on Zen 3--5 is actually to
+ * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+ * different numbers of streams and vector lengths are optimal
+ * on each CPU microarchitecture, making it challenging to take
+ * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+ * major upgrade.) For now, just choose between
+ * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+ * is needed anyway for crc32_le(), so we just reuse it here.
+ */
+ kernel_fpu_begin();
+ if (static_branch_likely(&have_vpclmul_avx512))
+ crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+ crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+ else
+ crc = crc32c_x86_3way(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+
+ /*
+ * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+ * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+ * instructions inline. While this doesn't use the CPU's CRC32
+ * throughput very well, it avoids the need to combine streams. Stream
+ * combination would be inefficient here.
+ */
+
+ for (num_longs = len / sizeof(unsigned long);
+ num_longs != 0; num_longs--, p += sizeof(unsigned long))
+ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
+ if (sizeof(unsigned long) > 4 && (len & 4)) {
+ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+ p += 4;
+ }
+ if (len & 2) {
+ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+ p += 2;
+ }
+ if (len & 1)
+ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_XMM4_2))
+ static_branch_enable(&have_crc32);
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512()) {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx512);
+ static_branch_enable(&have_vpclmul_avx512);
+ } else {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx2);
+ }
+ }
+ }
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ u32 optimizations = 0;
+
+ if (static_key_enabled(&have_crc32))
+ optimizations |= CRC32C_OPTIMIZATION;
+ if (static_key_enabled(&have_pclmulqdq))
+ optimizations |= CRC32_LE_OPTIMIZATION;
+ return optimizations;
+}