From b4a8528d17fbcd9027290c168efd6ba7ac4d4cd2 Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@kernel.org> Date: Wed, 10 Dec 2025 17:18:36 -0800 Subject: lib/crypto: arm64/nh: Migrate optimized code into library Migrate the arm64 NEON implementation of NH into lib/crypto/. This makes the nh() function be optimized on arm64 kernels. Note: this temporarily makes the adiantum template not utilize the arm64 optimized NH code. This is resolved in a later commit that converts the adiantum template to use nh() instead of "nhpoly1305". Link: https://lore.kernel.org/r/20251211011846.8179-5-ebiggers@kernel.org Signed-off-by: Eric Biggers <ebiggers@kernel.org> --- arch/arm64/crypto/Kconfig | 10 --- arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/nh-neon-core.S | 104 ------------------------------- arch/arm64/crypto/nhpoly1305-neon-glue.c | 79 ----------------------- lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + lib/crypto/arm64/nh-neon-core.S | 103 ++++++++++++++++++++++++++++++ lib/crypto/arm64/nh.h | 34 ++++++++++ 8 files changed, 139 insertions(+), 196 deletions(-) delete mode 100644 arch/arm64/crypto/nh-neon-core.S delete mode 100644 arch/arm64/crypto/nhpoly1305-neon-glue.c create mode 100644 lib/crypto/arm64/nh-neon-core.S create mode 100644 lib/crypto/arm64/nh.h diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index bdd276a6e540..da1c9ea8ea83 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -15,16 +15,6 @@ config CRYPTO_GHASH_ARM64_CE Architecture: arm64 using: - ARMv8 Crypto Extensions -config CRYPTO_NHPOLY1305_NEON - tristate "Hash functions: NHPoly1305 (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_NHPOLY1305 - help - NHPoly1305 hash function (Adiantum) - - Architecture: arm64 using: - - NEON (Advanced SIMD) extensions - config CRYPTO_SM3_NEON tristate "Hash functions: SM3 (NEON)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 1e330aa08d3f..3ab4b58e5c4c 100644 --- a/arch/arm64/crypto/Makefile 
+++ b/arch/arm64/crypto/Makefile @@ -41,9 +41,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o -obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o -nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o - obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S deleted file mode 100644 index 13eda08fda1e..000000000000 --- a/arch/arm64/crypto/nh-neon-core.S +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NH - ε-almost-universal hash function, ARM64 NEON accelerated version - * - * Copyright 2018 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - - KEY .req x0 - MESSAGE .req x1 - MESSAGE_LEN .req x2 - HASH .req x3 - - PASS0_SUMS .req v0 - PASS1_SUMS .req v1 - PASS2_SUMS .req v2 - PASS3_SUMS .req v3 - K0 .req v4 - K1 .req v5 - K2 .req v6 - K3 .req v7 - T0 .req v8 - T1 .req v9 - T2 .req v10 - T3 .req v11 - T4 .req v12 - T5 .req v13 - T6 .req v14 - T7 .req v15 - -.macro _nh_stride k0, k1, k2, k3 - - // Load next message stride - ld1 {T3.16b}, [MESSAGE], #16 - - // Load next key stride - ld1 {\k3\().4s}, [KEY], #16 - - // Add message words to key words - add T0.4s, T3.4s, \k0\().4s - add T1.4s, T3.4s, \k1\().4s - add T2.4s, T3.4s, \k2\().4s - add T3.4s, T3.4s, \k3\().4s - - // Multiply 32x32 => 64 and accumulate - mov T4.d[0], T0.d[1] - mov T5.d[0], T1.d[1] - mov T6.d[0], T2.d[1] - mov T7.d[0], T3.d[1] - umlal PASS0_SUMS.2d, T0.2s, T4.2s - umlal PASS1_SUMS.2d, T1.2s, T5.2s - umlal PASS2_SUMS.2d, T2.2s, T6.2s - umlal PASS3_SUMS.2d, T3.2s, T7.2s -.endm - -/* - * void nh_neon(const u32 *key, const u8 *message, size_t message_len, - * __le64 hash[NH_NUM_PASSES]) - * - * It's guaranteed that message_len % 16 == 0. 
- */ -SYM_TYPED_FUNC_START(nh_neon) - - ld1 {K0.4s,K1.4s}, [KEY], #32 - movi PASS0_SUMS.2d, #0 - movi PASS1_SUMS.2d, #0 - ld1 {K2.4s}, [KEY], #16 - movi PASS2_SUMS.2d, #0 - movi PASS3_SUMS.2d, #0 - - subs MESSAGE_LEN, MESSAGE_LEN, #64 - blt .Lloop4_done -.Lloop4: - _nh_stride K0, K1, K2, K3 - _nh_stride K1, K2, K3, K0 - _nh_stride K2, K3, K0, K1 - _nh_stride K3, K0, K1, K2 - subs MESSAGE_LEN, MESSAGE_LEN, #64 - bge .Lloop4 - -.Lloop4_done: - ands MESSAGE_LEN, MESSAGE_LEN, #63 - beq .Ldone - _nh_stride K0, K1, K2, K3 - - subs MESSAGE_LEN, MESSAGE_LEN, #16 - beq .Ldone - _nh_stride K1, K2, K3, K0 - - subs MESSAGE_LEN, MESSAGE_LEN, #16 - beq .Ldone - _nh_stride K2, K3, K0, K1 - -.Ldone: - // Sum the accumulators for each pass, then store the sums to 'hash' - addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d - addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d - st1 {T0.16b,T1.16b}, [HASH] - ret -SYM_FUNC_END(nh_neon) diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c deleted file mode 100644 index 013de6ac569a..000000000000 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum - * (ARM64 NEON accelerated version) - * - * Copyright 2018 Google LLC - */ - -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/nhpoly1305.h> -#include <linux/module.h> - -asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, - __le64 hash[NH_NUM_PASSES]); - -static int nhpoly1305_neon_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - if (srclen < 64 || !crypto_simd_usable()) - return crypto_nhpoly1305_update(desc, src, srclen); - - do { - unsigned int n = min_t(unsigned int, srclen, SZ_4K); - - scoped_ksimd() - crypto_nhpoly1305_update_helper(desc, src, n, nh_neon); - src += n; - srclen -= n; - } while (srclen); - return 0; -} - -static int nhpoly1305_neon_digest(struct shash_desc *desc, - const u8 *src, unsigned int 
srclen, u8 *out) -{ - return crypto_nhpoly1305_init(desc) ?: - nhpoly1305_neon_update(desc, src, srclen) ?: - crypto_nhpoly1305_final(desc, out); -} - -static struct shash_alg nhpoly1305_alg = { - .base.cra_name = "nhpoly1305", - .base.cra_driver_name = "nhpoly1305-neon", - .base.cra_priority = 200, - .base.cra_ctxsize = sizeof(struct nhpoly1305_key), - .base.cra_module = THIS_MODULE, - .digestsize = POLY1305_DIGEST_SIZE, - .init = crypto_nhpoly1305_init, - .update = nhpoly1305_neon_update, - .final = crypto_nhpoly1305_final, - .digest = nhpoly1305_neon_digest, - .setkey = crypto_nhpoly1305_setkey, - .descsize = sizeof(struct nhpoly1305_state), -}; - -static int __init nhpoly1305_mod_init(void) -{ - if (!cpu_have_named_feature(ASIMD)) - return -ENODEV; - - return crypto_register_shash(&nhpoly1305_alg); -} - -static void __exit nhpoly1305_mod_exit(void) -{ - crypto_unregister_shash(&nhpoly1305_alg); -} - -module_init(nhpoly1305_mod_init); -module_exit(nhpoly1305_mod_exit); - -MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); -MODULE_ALIAS_CRYPTO("nhpoly1305"); -MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c6ee7ca77632..aa3f850ece24 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -118,6 +118,7 @@ config CRYPTO_LIB_NH_ARCH bool depends on CRYPTO_LIB_NH && !UML default y if ARM && KERNEL_MODE_NEON + default y if ARM64 && KERNEL_MODE_NEON config CRYPTO_LIB_POLY1305 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 6dae7e182847..e3a13952bc2a 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -136,6 +136,7 @@ libnh-y := nh.o ifeq ($(CONFIG_CRYPTO_LIB_NH_ARCH),y) CFLAGS_nh.o += -I$(src)/$(SRCARCH) libnh-$(CONFIG_ARM) += arm/nh-neon-core.o +libnh-$(CONFIG_ARM64) += arm64/nh-neon-core.o endif ################################################################################ diff --git 
a/lib/crypto/arm64/nh-neon-core.S b/lib/crypto/arm64/nh-neon-core.S new file mode 100644 index 000000000000..6fa57fce8085 --- /dev/null +++ b/lib/crypto/arm64/nh-neon-core.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, ARM64 NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> + + KEY .req x0 + MESSAGE .req x1 + MESSAGE_LEN .req x2 + HASH .req x3 + + PASS0_SUMS .req v0 + PASS1_SUMS .req v1 + PASS2_SUMS .req v2 + PASS3_SUMS .req v3 + K0 .req v4 + K1 .req v5 + K2 .req v6 + K3 .req v7 + T0 .req v8 + T1 .req v9 + T2 .req v10 + T3 .req v11 + T4 .req v12 + T5 .req v13 + T6 .req v14 + T7 .req v15 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + ld1 {T3.16b}, [MESSAGE], #16 + + // Load next key stride + ld1 {\k3\().4s}, [KEY], #16 + + // Add message words to key words + add T0.4s, T3.4s, \k0\().4s + add T1.4s, T3.4s, \k1\().4s + add T2.4s, T3.4s, \k2\().4s + add T3.4s, T3.4s, \k3\().4s + + // Multiply 32x32 => 64 and accumulate + mov T4.d[0], T0.d[1] + mov T5.d[0], T1.d[1] + mov T6.d[0], T2.d[1] + mov T7.d[0], T3.d[1] + umlal PASS0_SUMS.2d, T0.2s, T4.2s + umlal PASS1_SUMS.2d, T1.2s, T5.2s + umlal PASS2_SUMS.2d, T2.2s, T6.2s + umlal PASS3_SUMS.2d, T3.2s, T7.2s +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * __le64 hash[NH_NUM_PASSES]) + * + * It's guaranteed that message_len % 16 == 0. 
+ */ +SYM_FUNC_START(nh_neon) + + ld1 {K0.4s,K1.4s}, [KEY], #32 + movi PASS0_SUMS.2d, #0 + movi PASS1_SUMS.2d, #0 + ld1 {K2.4s}, [KEY], #16 + movi PASS2_SUMS.2d, #0 + movi PASS3_SUMS.2d, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d + addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d + st1 {T0.16b,T1.16b}, [HASH] + ret +SYM_FUNC_END(nh_neon) diff --git a/lib/crypto/arm64/nh.h b/lib/crypto/arm64/nh.h new file mode 100644 index 000000000000..08902630bdd1 --- /dev/null +++ b/lib/crypto/arm64/nh.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM64 accelerated implementation of NH + * + * Copyright 2018 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]); + +static bool nh_arch(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + if (static_branch_likely(&have_neon) && message_len >= 64 && + may_use_simd()) { + scoped_ksimd() + nh_neon(key, message, message_len, hash); + return true; + } + return false; +} + +#define nh_mod_init_arch nh_mod_init_arch +static void nh_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); +} -- cgit v1.2.3