diff options
Diffstat (limited to 'arch/powerpc/crypto')
23 files changed, 124 insertions, 6174 deletions
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 951a43726461..caaa359f4742 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -3,52 +3,20 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" config CRYPTO_CURVE25519_PPC64 - tristate "Public key crypto: Curve25519 (PowerPC64)" + tristate depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + default CRYPTO_LIB_CURVE25519_INTERNAL help Curve25519 algorithm Architecture: PowerPC64 - Little-endian -config CRYPTO_CRC32C_VPMSUM - tristate "CRC32c" - depends on PPC64 && ALTIVEC - select CRYPTO_HASH - select CRC32 - help - CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720) - - Architecture: powerpc64 using - - AltiVec extensions - - Enable on POWER8 and newer processors for improved performance. - -config CRYPTO_CRCT10DIF_VPMSUM - tristate "CRC32T10DIF" - depends on PPC64 && ALTIVEC && CRC_T10DIF - select CRYPTO_HASH - help - CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) - - Architecture: powerpc64 using - - AltiVec extensions - - Enable on POWER8 and newer processors for improved performance. - -config CRYPTO_VPMSUM_TESTER - tristate "CRC32c and CRC32T10DIF hardware acceleration tester" - depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM - help - Stress test for CRC32c and CRCT10DIF algorithms implemented with - powerpc64 AltiVec extensions (POWER8 vpmsum instructions). - Unless you are testing these algorithms, you don't need this. - config CRYPTO_MD5_PPC tristate "Digests: MD5" - depends on PPC select CRYPTO_HASH help MD5 message digest algorithm (RFC1321) @@ -57,7 +25,6 @@ config CRYPTO_MD5_PPC config CRYPTO_SHA1_PPC tristate "Hash functions: SHA-1" - depends on PPC help SHA-1 secure hash algorithm (FIPS 180) @@ -65,27 +32,16 @@ config CRYPTO_SHA1_PPC config CRYPTO_SHA1_PPC_SPE tristate "Hash functions: SHA-1 (SPE)" - depends on PPC && SPE + depends on SPE help SHA-1 secure hash algorithm (FIPS 180) Architecture: powerpc using - SPE (Signal Processing Engine) extensions -config CRYPTO_SHA256_PPC_SPE - tristate "Hash functions: SHA-224 and SHA-256 (SPE)" - depends on PPC && SPE - select CRYPTO_SHA256 - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: powerpc using - - SPE (Signal Processing Engine) extensions - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" - depends on PPC && SPE + depends on SPE select CRYPTO_SKCIPHER help Block ciphers: AES cipher algorithms (FIPS-197) @@ -123,32 +79,6 @@ config CRYPTO_AES_GCM_P10 Support for cryptographic acceleration instructions on Power10 or later CPU. This module supports stitched acceleration for AES/GCM. -config CRYPTO_CHACHA20_P10 - tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)" - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - select CRYPTO_SKCIPHER - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - help - Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 - stream cipher algorithms - - Architecture: PowerPC64 - - Power10 or later - - Little-endian - -config CRYPTO_POLY1305_P10 - tristate "Hash functions: Poly1305 (P10 or later)" - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - select CRYPTO_HASH - select CRYPTO_LIB_POLY1305_GENERIC - help - Poly1305 authenticator algorithm (RFC7539) - - Architecture: PowerPC64 - - Power10 or later - - Little-endian - config CRYPTO_DEV_VMX bool "Support for VMX cryptographic acceleration instructions" depends on PPC64 && VSX diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 59808592f0a1..8c2936ae466f 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -9,13 +9,7 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o -obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o -obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o -obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o -obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o -obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o -obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o @@ -23,12 +17,7 @@ aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes- md5-ppc-y := md5-asm.o md5-glue.o sha1-powerpc-y := sha1-powerpc-asm.o sha1.o sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o -sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o -crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o -crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o -chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o -poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o @@ -56,3 +45,4 @@ $(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE OBJECT_FILES_NON_STANDARD_aesp10-ppc.o := y OBJECT_FILES_NON_STANDARD_ghashp10-ppc.o := y OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y +OBJECT_FILES_NON_STANDARD_ghashp8-ppc.o := y diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c index f37b3d13fc53..85f4fd4b1bdc 100644 --- a/arch/powerpc/crypto/aes-gcm-p10-glue.c +++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c @@ -35,9 +35,9 @@ MODULE_ALIAS_CRYPTO("aes"); asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits, void *key); asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key); -asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len, +asmlinkage void aes_p10_gcm_encrypt(const u8 *in, u8 *out, size_t len, void *rkey, u8 *iv, void *Xi); -asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len, +asmlinkage void aes_p10_gcm_decrypt(const u8 *in, u8 *out, size_t len, void *rkey, u8 *iv, void *Xi); asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]); asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable, @@ -214,7 +214,6 @@ static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv, struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN); u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN]; struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN); - struct scatter_walk assoc_sg_walk; struct skcipher_walk walk; u8 *assocmem = NULL; u8 *assoc; @@ -234,8 +233,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv, /* Linearize assoc, if not already linear */ if (req->src->length >= assoclen && req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = sg_virt(req->src); /* ppc64 is !HIGHMEM */ } else { gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? GFP_KERNEL : GFP_ATOMIC; @@ -253,10 +251,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv, gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen); vsx_end(); - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); + kfree(assocmem); if (enc) ret = skcipher_walk_aead_encrypt(&walk, req, false); @@ -266,7 +261,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv, return ret; while ((nbytes = walk.nbytes) > 0 && ret == 0) { - u8 *src = walk.src.virt.addr; + const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; u8 buf[AES_BLOCK_SIZE]; diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c index ec06189fbf99..3f1e5e894902 100644 --- a/arch/powerpc/crypto/aes.c +++ b/arch/powerpc/crypto/aes.c @@ -7,15 +7,15 @@ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com> */ -#include <linux/types.h> -#include <linux/err.h> -#include <linux/crypto.h> -#include <linux/delay.h> #include <asm/simd.h> #include <asm/switch_to.h> #include <crypto/aes.h> #include <crypto/internal/cipher.h> #include <crypto/internal/simd.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/uaccess.h> #include "aesp8-ppc.h" diff --git a/arch/powerpc/crypto/aes_cbc.c b/arch/powerpc/crypto/aes_cbc.c index ed0debc7acb5..5f2a4f375eef 100644 --- a/arch/powerpc/crypto/aes_cbc.c +++ b/arch/powerpc/crypto/aes_cbc.c @@ -12,6 +12,10 @@ #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/uaccess.h> #include "aesp8-ppc.h" diff --git a/arch/powerpc/crypto/aes_ctr.c b/arch/powerpc/crypto/aes_ctr.c index 9a3da8cd62f3..e27c4036e711 100644 --- a/arch/powerpc/crypto/aes_ctr.c +++ b/arch/powerpc/crypto/aes_ctr.c @@ -12,6 +12,10 @@ #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/uaccess.h> #include "aesp8-ppc.h" @@ -69,9 +73,9 @@ static int p8_aes_ctr_setkey(struct crypto_skcipher *tfm, const u8 *key, static void p8_aes_ctr_final(const struct p8_aes_ctr_ctx *ctx, struct skcipher_walk *walk) { + const u8 *src = walk->src.virt.addr; u8 *ctrblk = walk->iv; u8 keystream[AES_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; diff --git a/arch/powerpc/crypto/aes_xts.c b/arch/powerpc/crypto/aes_xts.c index dabbccb41550..9440e771cede 100644 --- a/arch/powerpc/crypto/aes_xts.c +++ b/arch/powerpc/crypto/aes_xts.c @@ -13,6 +13,10 @@ #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/uaccess.h> #include "aesp8-ppc.h" diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c deleted file mode 100644 index 7c728755852e..000000000000 --- a/arch/powerpc/crypto/chacha-p10-glue.c +++ /dev/null @@ -1,227 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright 2023- IBM Corp. All rights reserved. - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/cpufeature.h> -#include <linux/sizes.h> -#include <asm/simd.h> -#include <asm/switch_to.h> - -asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - unsigned int l = bytes & ~0x0FF; - - if (l > 0) { - chacha_p10le_8x(state, dst, src, l, nrounds); - bytes -= l; - src += l; - dst += l; - state[12] += l / CHACHA_BLOCK_SIZE; - } - - if (bytes > 0) - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - hchacha_block_generic(state, stream, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - vsx_begin(); - chacha_p10_do_8x(state, dst, src, todo, nrounds); - vsx_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_p10_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - if (err) - return err; - - chacha_init_generic(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = rounddown(nbytes, walk.stride); - - if (!crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - vsx_begin(); - chacha_p10_do_8x(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - vsx_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - if (err) - break; - } - - return err; -} - -static int chacha_p10(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_p10_stream_xor(req, ctx, req->iv); -} - -static int xchacha_p10(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - chacha_init_generic(state, ctx->key, req->iv); - hchacha_block_arch(state, subctx.key, ctx->nrounds); - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_p10_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-p10", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_p10, - .decrypt = chacha_p10, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-p10", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_p10, - .decrypt = xchacha_p10, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-p10", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_p10, - .decrypt = xchacha_p10, - } -}; - -static int __init chacha_p10_init(void) -{ - if (!cpu_has_feature(CPU_FTR_ARCH_31)) - return 0; - - static_branch_enable(&have_p10); - - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha_p10_exit(void) -{ - if (!static_branch_likely(&have_p10)) - return; - - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_p10_init); -module_exit(chacha_p10_exit); - -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)"); -MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-p10"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-p10"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-p10"); diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S deleted file mode 100644 index 17bedb66b822..000000000000 --- a/arch/powerpc/crypto/chacha-p10le-8x.S +++ /dev/null @@ -1,842 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -# -# Accelerated chacha20 implementation for ppc64le. -# -# Copyright 2023- IBM Corp. All rights reserved -# -#=================================================================================== -# Written by Danny Tsen <dtsen@us.ibm.com> -# -# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, -# size_t len, int nrounds); -# -# do rounds, 8 quarter rounds -# 1. a += b; d ^= a; d <<<= 16; -# 2. c += d; b ^= c; b <<<= 12; -# 3. a += b; d ^= a; d <<<= 8; -# 4. c += d; b ^= c; b <<<= 7 -# -# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 -# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 -# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 -# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 -# -# 4 blocks (a b c d) -# -# a0 b0 c0 d0 -# a1 b1 c1 d1 -# ... -# a4 b4 c4 d4 -# ... -# a8 b8 c8 d8 -# ... -# a12 b12 c12 d12 -# a13 ... -# a14 ... -# a15 b15 c15 d15 -# -# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) -# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) -# - -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/asm-compat.h> -#include <linux/linkage.h> - -.machine "any" -.text - -.macro SAVE_GPR GPR OFFSET FRAME - std \GPR,\OFFSET(\FRAME) -.endm - -.macro SAVE_VRS VRS OFFSET FRAME - li 16, \OFFSET - stvx \VRS, 16, \FRAME -.endm - -.macro SAVE_VSX VSX OFFSET FRAME - li 16, \OFFSET - stxvx \VSX, 16, \FRAME -.endm - -.macro RESTORE_GPR GPR OFFSET FRAME - ld \GPR,\OFFSET(\FRAME) -.endm - -.macro RESTORE_VRS VRS OFFSET FRAME - li 16, \OFFSET - lvx \VRS, 16, \FRAME -.endm - -.macro RESTORE_VSX VSX OFFSET FRAME - li 16, \OFFSET - lxvx \VSX, 16, \FRAME -.endm - -.macro SAVE_REGS - mflr 0 - std 0, 16(1) - stdu 1,-752(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - addi 9, 1, 256 - SAVE_VRS 20, 0, 9 - SAVE_VRS 21, 16, 9 - SAVE_VRS 22, 32, 9 - SAVE_VRS 23, 48, 9 - SAVE_VRS 24, 64, 9 - SAVE_VRS 25, 80, 9 - SAVE_VRS 26, 96, 9 - SAVE_VRS 27, 112, 9 - SAVE_VRS 28, 128, 9 - SAVE_VRS 29, 144, 9 - SAVE_VRS 30, 160, 9 - SAVE_VRS 31, 176, 9 - - SAVE_VSX 14, 192, 9 - SAVE_VSX 15, 208, 9 - SAVE_VSX 16, 224, 9 - SAVE_VSX 17, 240, 9 - SAVE_VSX 18, 256, 9 - SAVE_VSX 19, 272, 9 - SAVE_VSX 20, 288, 9 - SAVE_VSX 21, 304, 9 - SAVE_VSX 22, 320, 9 - SAVE_VSX 23, 336, 9 - SAVE_VSX 24, 352, 9 - SAVE_VSX 25, 368, 9 - SAVE_VSX 26, 384, 9 - SAVE_VSX 27, 400, 9 - SAVE_VSX 28, 416, 9 - SAVE_VSX 29, 432, 9 - SAVE_VSX 30, 448, 9 - SAVE_VSX 31, 464, 9 -.endm # SAVE_REGS - -.macro RESTORE_REGS - addi 9, 1, 256 - RESTORE_VRS 20, 0, 9 - RESTORE_VRS 21, 16, 9 - RESTORE_VRS 22, 32, 9 - RESTORE_VRS 23, 48, 9 - RESTORE_VRS 24, 64, 9 - RESTORE_VRS 25, 80, 9 - RESTORE_VRS 26, 96, 9 - RESTORE_VRS 27, 112, 9 - RESTORE_VRS 28, 128, 9 - RESTORE_VRS 29, 144, 9 - RESTORE_VRS 30, 160, 9 - RESTORE_VRS 31, 176, 9 - - RESTORE_VSX 14, 192, 9 - RESTORE_VSX 15, 208, 9 - RESTORE_VSX 16, 224, 9 - RESTORE_VSX 17, 240, 9 - RESTORE_VSX 18, 256, 9 - RESTORE_VSX 19, 272, 9 - RESTORE_VSX 20, 288, 9 - RESTORE_VSX 21, 304, 9 - RESTORE_VSX 22, 320, 9 - RESTORE_VSX 23, 336, 9 - RESTORE_VSX 24, 352, 9 - RESTORE_VSX 25, 368, 9 - RESTORE_VSX 26, 384, 9 - RESTORE_VSX 27, 400, 9 - RESTORE_VSX 28, 416, 9 - RESTORE_VSX 29, 432, 9 - RESTORE_VSX 30, 448, 9 - RESTORE_VSX 31, 464, 9 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 752 - ld 0, 16(1) - mtlr 0 -.endm # RESTORE_REGS - -.macro QT_loop_8x - # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) - xxlor 0, 32+25, 32+25 - xxlor 32+25, 20, 20 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vadduwm 16, 16, 20 - vadduwm 17, 17, 21 - vadduwm 18, 18, 22 - vadduwm 19, 19, 23 - - vpermxor 12, 12, 0, 25 - vpermxor 13, 13, 1, 25 - vpermxor 14, 14, 2, 25 - vpermxor 15, 15, 3, 25 - vpermxor 28, 28, 16, 25 - vpermxor 29, 29, 17, 25 - vpermxor 30, 30, 18, 25 - vpermxor 31, 31, 19, 25 - xxlor 32+25, 0, 0 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vadduwm 24, 24, 28 - vadduwm 25, 25, 29 - vadduwm 26, 26, 30 - vadduwm 27, 27, 31 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vxor 20, 20, 24 - vxor 21, 21, 25 - vxor 22, 22, 26 - vxor 23, 23, 27 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 21, 21 - vrlw 4, 4, 25 # - vrlw 5, 5, 25 - vrlw 6, 6, 25 - vrlw 7, 7, 25 - vrlw 20, 20, 25 # - vrlw 21, 21, 25 - vrlw 22, 22, 25 - vrlw 23, 23, 25 - xxlor 32+25, 0, 0 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vadduwm 16, 16, 20 - vadduwm 17, 17, 21 - vadduwm 18, 18, 22 - vadduwm 19, 19, 23 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 22, 22 - vpermxor 12, 12, 0, 25 - vpermxor 13, 13, 1, 25 - vpermxor 14, 14, 2, 25 - vpermxor 15, 15, 3, 25 - vpermxor 28, 28, 16, 25 - vpermxor 29, 29, 17, 25 - vpermxor 30, 30, 18, 25 - vpermxor 31, 31, 19, 25 - xxlor 32+25, 0, 0 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vadduwm 24, 24, 28 - vadduwm 25, 25, 29 - vadduwm 26, 26, 30 - vadduwm 27, 27, 31 - xxlor 0, 32+28, 32+28 - xxlor 32+28, 23, 23 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vxor 20, 20, 24 - vxor 21, 21, 25 - vxor 22, 22, 26 - vxor 23, 23, 27 - vrlw 4, 4, 28 # - vrlw 5, 5, 28 - vrlw 6, 6, 28 - vrlw 7, 7, 28 - vrlw 20, 20, 28 # - vrlw 21, 21, 28 - vrlw 22, 22, 28 - vrlw 23, 23, 28 - xxlor 32+28, 0, 0 - - # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) - xxlor 0, 32+25, 32+25 - xxlor 32+25, 20, 20 - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vadduwm 16, 16, 21 - vadduwm 17, 17, 22 - vadduwm 18, 18, 23 - vadduwm 19, 19, 20 - - vpermxor 15, 15, 0, 25 - vpermxor 12, 12, 1, 25 - vpermxor 13, 13, 2, 25 - vpermxor 14, 14, 3, 25 - vpermxor 31, 31, 16, 25 - vpermxor 28, 28, 17, 25 - vpermxor 29, 29, 18, 25 - vpermxor 30, 30, 19, 25 - - xxlor 32+25, 0, 0 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vadduwm 26, 26, 31 - vadduwm 27, 27, 28 - vadduwm 24, 24, 29 - vadduwm 25, 25, 30 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vxor 21, 21, 26 - vxor 22, 22, 27 - vxor 23, 23, 24 - vxor 20, 20, 25 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 21, 21 - vrlw 5, 5, 25 - vrlw 6, 6, 25 - vrlw 7, 7, 25 - vrlw 4, 4, 25 - vrlw 21, 21, 25 - vrlw 22, 22, 25 - vrlw 23, 23, 25 - vrlw 20, 20, 25 - xxlor 32+25, 0, 0 - - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vadduwm 16, 16, 21 - vadduwm 17, 17, 22 - vadduwm 18, 18, 23 - vadduwm 19, 19, 20 - - xxlor 0, 32+25, 32+25 - xxlor 32+25, 22, 22 - vpermxor 15, 15, 0, 25 - vpermxor 12, 12, 1, 25 - vpermxor 13, 13, 2, 25 - vpermxor 14, 14, 3, 25 - vpermxor 31, 31, 16, 25 - vpermxor 28, 28, 17, 25 - vpermxor 29, 29, 18, 25 - vpermxor 30, 30, 19, 25 - xxlor 32+25, 0, 0 - - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vadduwm 26, 26, 31 - vadduwm 27, 27, 28 - vadduwm 24, 24, 29 - vadduwm 25, 25, 30 - - xxlor 0, 32+28, 32+28 - xxlor 32+28, 23, 23 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vxor 21, 21, 26 - vxor 22, 22, 27 - vxor 23, 23, 24 - vxor 20, 20, 25 - vrlw 5, 5, 28 - vrlw 6, 6, 28 - vrlw 7, 7, 28 - vrlw 4, 4, 28 - vrlw 21, 21, 28 - vrlw 22, 22, 28 - vrlw 23, 23, 28 - vrlw 20, 20, 28 - xxlor 32+28, 0, 0 -.endm - -.macro QT_loop_4x - # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vpermxor 12, 12, 0, 20 - vpermxor 13, 13, 1, 20 - vpermxor 14, 14, 2, 20 - vpermxor 15, 15, 3, 20 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vrlw 4, 4, 21 - vrlw 5, 5, 21 - vrlw 6, 6, 21 - vrlw 7, 7, 21 - vadduwm 0, 0, 4 - vadduwm 1, 1, 5 - vadduwm 2, 2, 6 - vadduwm 3, 3, 7 - vpermxor 12, 12, 0, 22 - vpermxor 13, 13, 1, 22 - vpermxor 14, 14, 2, 22 - vpermxor 15, 15, 3, 22 - vadduwm 8, 8, 12 - vadduwm 9, 9, 13 - vadduwm 10, 10, 14 - vadduwm 11, 11, 15 - vxor 4, 4, 8 - vxor 5, 5, 9 - vxor 6, 6, 10 - vxor 7, 7, 11 - vrlw 4, 4, 23 - vrlw 5, 5, 23 - vrlw 6, 6, 23 - vrlw 7, 7, 23 - - # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vpermxor 15, 15, 0, 20 - vpermxor 12, 12, 1, 20 - vpermxor 13, 13, 2, 20 - vpermxor 14, 14, 3, 20 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vrlw 5, 5, 21 - vrlw 6, 6, 21 - vrlw 7, 7, 21 - vrlw 4, 4, 21 - vadduwm 0, 0, 5 - vadduwm 1, 1, 6 - vadduwm 2, 2, 7 - vadduwm 3, 3, 4 - vpermxor 15, 15, 0, 22 - vpermxor 12, 12, 1, 22 - vpermxor 13, 13, 2, 22 - vpermxor 14, 14, 3, 22 - vadduwm 10, 10, 15 - vadduwm 11, 11, 12 - vadduwm 8, 8, 13 - vadduwm 9, 9, 14 - vxor 5, 5, 10 - vxor 6, 6, 11 - vxor 7, 7, 8 - vxor 4, 4, 9 - vrlw 5, 5, 23 - vrlw 6, 6, 23 - vrlw 7, 7, 23 - vrlw 4, 4, 23 -.endm - -# Transpose -.macro TP_4x a0 a1 a2 a3 - xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 - xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 - xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 - xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 - xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 - xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 - xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 - xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 -.endm - -# key stream = working state + state -.macro Add_state S - vadduwm \S+0, \S+0, 16-\S - vadduwm \S+4, \S+4, 17-\S - vadduwm \S+8, \S+8, 18-\S - vadduwm \S+12, \S+12, 19-\S - - vadduwm \S+1, \S+1, 16-\S - vadduwm \S+5, \S+5, 17-\S - vadduwm \S+9, \S+9, 18-\S - vadduwm \S+13, \S+13, 19-\S - - vadduwm \S+2, \S+2, 16-\S - vadduwm \S+6, \S+6, 17-\S - vadduwm \S+10, \S+10, 18-\S - vadduwm \S+14, \S+14, 19-\S - - vadduwm \S+3, \S+3, 16-\S - vadduwm \S+7, \S+7, 17-\S - vadduwm \S+11, \S+11, 18-\S - vadduwm \S+15, \S+15, 19-\S -.endm - -# -# write 256 bytes -# -.macro Write_256 S - add 9, 14, 5 - add 16, 14, 4 - lxvw4x 0, 0, 9 - lxvw4x 1, 17, 9 - lxvw4x 2, 18, 9 - lxvw4x 3, 19, 9 - lxvw4x 4, 20, 9 - lxvw4x 5, 21, 9 - lxvw4x 6, 22, 9 - lxvw4x 7, 23, 9 - lxvw4x 8, 24, 9 - lxvw4x 9, 25, 9 - lxvw4x 10, 26, 9 - lxvw4x 11, 27, 9 - lxvw4x 12, 28, 9 - lxvw4x 13, 29, 9 - lxvw4x 14, 30, 9 - lxvw4x 15, 31, 9 - - xxlxor \S+32, \S+32, 0 - xxlxor \S+36, \S+36, 1 - xxlxor \S+40, \S+40, 2 - xxlxor \S+44, \S+44, 3 - xxlxor \S+33, \S+33, 4 - xxlxor \S+37, \S+37, 5 - xxlxor \S+41, \S+41, 6 - xxlxor \S+45, \S+45, 7 - xxlxor \S+34, \S+34, 8 - xxlxor \S+38, \S+38, 9 - xxlxor \S+42, \S+42, 10 - xxlxor \S+46, \S+46, 11 - xxlxor \S+35, \S+35, 12 - xxlxor \S+39, \S+39, 13 - xxlxor \S+43, \S+43, 14 - xxlxor \S+47, \S+47, 15 - - stxvw4x \S+32, 0, 16 - stxvw4x \S+36, 17, 16 - stxvw4x \S+40, 18, 16 - stxvw4x \S+44, 19, 16 - - stxvw4x \S+33, 20, 16 - stxvw4x \S+37, 21, 16 - stxvw4x \S+41, 22, 16 - stxvw4x \S+45, 23, 16 - - stxvw4x \S+34, 24, 16 - stxvw4x \S+38, 25, 16 - stxvw4x \S+42, 26, 16 - stxvw4x \S+46, 27, 16 - - stxvw4x \S+35, 28, 16 - stxvw4x \S+39, 29, 16 - stxvw4x \S+43, 30, 16 - stxvw4x \S+47, 31, 16 - -.endm - -# -# chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds); -# -SYM_FUNC_START(chacha_p10le_8x) -.align 5 - cmpdi 6, 0 - ble Out_no_chacha - - SAVE_REGS - - # r17 - r31 mainly for Write_256 macro. - li 17, 16 - li 18, 32 - li 19, 48 - li 20, 64 - li 21, 80 - li 22, 96 - li 23, 112 - li 24, 128 - li 25, 144 - li 26, 160 - li 27, 176 - li 28, 192 - li 29, 208 - li 30, 224 - li 31, 240 - - mr 15, 6 # len - li 14, 0 # offset to inp and outp - - lxvw4x 48, 0, 3 # vr16, constants - lxvw4x 49, 17, 3 # vr17, key 1 - lxvw4x 50, 18, 3 # vr18, key 2 - lxvw4x 51, 19, 3 # vr19, counter, nonce - - # create (0, 1, 2, 3) counters - vspltisw 0, 0 - vspltisw 1, 1 - vspltisw 2, 2 - vspltisw 3, 3 - vmrghw 4, 0, 1 - vmrglw 5, 2, 3 - vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) - - vspltisw 21, 12 - vspltisw 23, 7 - - addis 11, 2, permx@toc@ha - addi 11, 11, permx@toc@l - lxvw4x 32+20, 0, 11 - lxvw4x 32+22, 17, 11 - - sradi 8, 7, 1 - - mtctr 8 - - # save constants to vsx - xxlor 16, 48, 48 - xxlor 17, 49, 49 - xxlor 18, 50, 50 - xxlor 19, 51, 51 - - vspltisw 25, 4 - vspltisw 26, 8 - - xxlor 25, 32+26, 32+26 - xxlor 24, 32+25, 32+25 - - vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) - xxlor 30, 32+30, 32+30 - xxlor 31, 32+31, 32+31 - - xxlor 20, 32+20, 32+20 - xxlor 21, 32+21, 32+21 - xxlor 22, 32+22, 32+22 - xxlor 23, 32+23, 32+23 - - cmpdi 6, 512 - blt Loop_last - -Loop_8x: - xxspltw 32+0, 16, 0 - xxspltw 32+1, 16, 1 - xxspltw 32+2, 16, 2 - xxspltw 32+3, 16, 3 - - xxspltw 32+4, 17, 0 - xxspltw 32+5, 17, 1 - xxspltw 32+6, 17, 2 - xxspltw 32+7, 17, 3 - xxspltw 32+8, 18, 0 - xxspltw 32+9, 18, 1 - xxspltw 32+10, 18, 2 - xxspltw 32+11, 18, 3 - xxspltw 32+12, 19, 0 - xxspltw 32+13, 19, 1 - xxspltw 32+14, 19, 2 - xxspltw 32+15, 19, 3 - vadduwm 12, 12, 30 # increase counter - - xxspltw 32+16, 16, 0 - xxspltw 32+17, 16, 1 - xxspltw 32+18, 16, 2 - xxspltw 32+19, 16, 3 - - xxspltw 32+20, 17, 0 - xxspltw 32+21, 17, 1 - xxspltw 32+22, 17, 2 - xxspltw 32+23, 17, 3 - xxspltw 32+24, 18, 0 - xxspltw 32+25, 18, 1 - xxspltw 32+26, 18, 2 - xxspltw 32+27, 18, 3 - xxspltw 32+28, 19, 0 - xxspltw 32+29, 19, 1 - vadduwm 28, 28, 31 # increase counter - xxspltw 32+30, 19, 2 - xxspltw 32+31, 19, 3 - -.align 5 -quarter_loop_8x: - QT_loop_8x - - bdnz quarter_loop_8x - - xxlor 0, 32+30, 32+30 - xxlor 32+30, 30, 30 - vadduwm 12, 12, 30 - xxlor 32+30, 0, 0 - TP_4x 0, 1, 2, 3 - TP_4x 4, 5, 6, 7 - TP_4x 8, 9, 10, 11 - TP_4x 12, 13, 14, 15 - - xxlor 0, 48, 48 - xxlor 1, 49, 49 - xxlor 2, 50, 50 - xxlor 3, 51, 51 - xxlor 48, 16, 16 - xxlor 49, 17, 17 - xxlor 50, 18, 18 - xxlor 51, 19, 19 - Add_state 0 - xxlor 48, 0, 0 - xxlor 49, 1, 1 - xxlor 50, 2, 2 - xxlor 51, 3, 3 - Write_256 0 - addi 14, 14, 256 # offset +=256 - addi 15, 15, -256 # len -=256 - - xxlor 5, 32+31, 32+31 - xxlor 32+31, 31, 31 - vadduwm 28, 28, 31 - xxlor 32+31, 5, 5 - TP_4x 16+0, 16+1, 16+2, 16+3 - TP_4x 16+4, 16+5, 16+6, 16+7 - TP_4x 16+8, 16+9, 16+10, 16+11 - TP_4x 16+12, 16+13, 16+14, 16+15 - - xxlor 32, 16, 16 - xxlor 33, 17, 17 - xxlor 34, 18, 18 - xxlor 35, 19, 19 - Add_state 16 - Write_256 16 - addi 14, 14, 256 # offset +=256 - addi 15, 15, -256 # len +=256 - - xxlor 32+24, 24, 24 - xxlor 32+25, 25, 25 - xxlor 32+30, 30, 30 - vadduwm 30, 30, 25 - vadduwm 31, 30, 24 - xxlor 30, 32+30, 32+30 - xxlor 31, 32+31, 32+31 - - cmpdi 15, 0 - beq Out_loop - - cmpdi 15, 512 - blt Loop_last - - mtctr 8 - b Loop_8x - -Loop_last: - lxvw4x 48, 0, 3 # vr16, constants - lxvw4x 49, 17, 3 # vr17, key 1 - lxvw4x 50, 18, 3 # vr18, key 2 - lxvw4x 51, 19, 3 # vr19, counter, nonce - - vspltisw 21, 12 - vspltisw 23, 7 - addis 11, 2, permx@toc@ha - addi 11, 11, permx@toc@l - lxvw4x 32+20, 0, 11 - lxvw4x 32+22, 17, 11 - - sradi 8, 7, 1 - mtctr 8 - -Loop_4x: - vspltw 0, 16, 0 - vspltw 1, 16, 1 - vspltw 2, 16, 2 - vspltw 3, 16, 3 - - vspltw 4, 17, 0 - vspltw 5, 17, 1 - vspltw 6, 17, 2 - vspltw 7, 17, 3 - vspltw 8, 18, 0 - vspltw 9, 18, 1 - vspltw 10, 18, 2 - vspltw 11, 18, 3 - vspltw 12, 19, 0 - vadduwm 12, 12, 30 # increase counter - vspltw 13, 19, 1 - vspltw 14, 19, 2 - vspltw 15, 19, 3 - -.align 5 -quarter_loop: - QT_loop_4x - - bdnz quarter_loop - - vadduwm 12, 12, 30 - TP_4x 0, 1, 2, 3 - TP_4x 4, 5, 6, 7 - TP_4x 8, 9, 10, 11 - TP_4x 12, 13, 14, 15 - - Add_state 0 - Write_256 0 - addi 14, 14, 256 # offset += 256 - addi 15, 15, -256 # len += 256 - - # Update state counter - vspltisw 25, 4 - vadduwm 30, 30, 25 - - cmpdi 15, 0 - beq Out_loop - cmpdi 15, 256 - blt Out_loop - - mtctr 8 - b Loop_4x - -Out_loop: - RESTORE_REGS - blr - -Out_no_chacha: - li 3, 0 - blr -SYM_FUNC_END(chacha_p10le_8x) - -SYM_DATA_START_LOCAL(PERMX) -.align 5 -permx: -.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd -.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc -SYM_DATA_END(PERMX) diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c deleted file mode 100644 index c61a874a3a5c..000000000000 --- a/arch/powerpc/crypto/crc-vpmsum_test.c +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CRC vpmsum tester - * Copyright 2017 Daniel Axtens, IBM Corporation. - */ - -#include <linux/crc-t10dif.h> -#include <linux/crc32.h> -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/cpufeature.h> -#include <asm/switch_to.h> - -static unsigned long iterations = 10000; - -#define MAX_CRC_LENGTH 65535 - - -static int __init crc_test_init(void) -{ - u16 crc16 = 0, verify16 = 0; - __le32 verify32le = 0; - unsigned char *data; - u32 verify32 = 0; - unsigned long i; - __le32 crc32; - int ret; - - struct crypto_shash *crct10dif_tfm; - struct crypto_shash *crc32c_tfm; - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL); - if (!data) - return -ENOMEM; - - crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0); - - if (IS_ERR(crct10dif_tfm)) { - pr_err("Error allocating crc-t10dif\n"); - goto free_buf; - } - - crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0); - - if (IS_ERR(crc32c_tfm)) { - pr_err("Error allocating crc32c\n"); - goto free_16; - } - - do { - SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm); - SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm); - - crct10dif_shash->tfm = crct10dif_tfm; - ret = crypto_shash_init(crct10dif_shash); - - if (ret) { - pr_err("Error initing crc-t10dif\n"); - goto free_32; - } - - - crc32c_shash->tfm = crc32c_tfm; - ret = crypto_shash_init(crc32c_shash); - - if (ret) { - pr_err("Error initing crc32c\n"); - goto free_32; - } - - pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations); - for (i=0; i<iterations; i++) { - size_t offset = get_random_u32_below(16); - size_t len = get_random_u32_below(MAX_CRC_LENGTH); - - if (len <= offset) - continue; - get_random_bytes(data, len); - len -= offset; - - crypto_shash_update(crct10dif_shash, data+offset, len); - crypto_shash_final(crct10dif_shash, (u8 *)(&crc16)); - verify16 = crc_t10dif_generic(verify16, data+offset, len); - - - if (crc16 != verify16) { - pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n", - crc16, verify16, len); - break; - } - - crypto_shash_update(crc32c_shash, data+offset, len); - crypto_shash_final(crc32c_shash, (u8 *)(&crc32)); - verify32 = le32_to_cpu(verify32le); - verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len)); - if (crc32 != verify32le) { - pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n", - crc32, verify32, len); - break; - } - cond_resched(); - } - pr_info("crc-vpmsum_test done, completed %lu iterations\n", i); - } while (0); - -free_32: - crypto_free_shash(crc32c_tfm); - -free_16: - crypto_free_shash(crct10dif_tfm); - -free_buf: - kfree(data); - - return 0; -} - -static void __exit crc_test_exit(void) {} - -module_init(crc_test_init); -module_exit(crc_test_exit); -module_param(iterations, long, 0400); - -MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>"); -MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester"); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S deleted file mode 100644 index b0f87f595b26..000000000000 --- a/arch/powerpc/crypto/crc32-vpmsum_core.S +++ /dev/null @@ -1,746 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Core of the accelerated CRC algorithm. - * In your file, define the constants and CRC_FUNCTION_NAME - * Then include this file. - * - * Calculate the checksum of data that is 16 byte aligned and a multiple of - * 16 bytes. - * - * The first step is to reduce it to 1024 bits. We do this in 8 parallel - * chunks in order to mask the latency of the vpmsum instructions. If we - * have more than 32 kB of data to checksum we repeat this step multiple - * times, passing in the previous 1024 bits. - * - * The next step is to reduce the 1024 bits to 64 bits. This step adds - * 32 bits of 0s to the end - this matches what a CRC does. We just - * calculate constants that land the data in this 32 bits. - * - * We then use fixed point Barrett reduction to compute a mod n over GF(2) - * for n = CRC using POWER8 instructions. We use x = 32. - * - * https://en.wikipedia.org/wiki/Barrett_reduction - * - * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM -*/ - -#include <asm/ppc_asm.h> -#include <asm/ppc-opcode.h> - -#define MAX_SIZE 32768 - - .text - -#if defined(__BIG_ENDIAN__) && defined(REFLECT) -#define BYTESWAP_DATA -#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) -#define BYTESWAP_DATA -#else -#undef BYTESWAP_DATA -#endif - -#define off16 r25 -#define off32 r26 -#define off48 r27 -#define off64 r28 -#define off80 r29 -#define off96 r30 -#define off112 r31 - -#define const1 v24 -#define const2 v25 - -#define byteswap v26 -#define mask_32bit v27 -#define mask_64bit v28 -#define zeroes v29 - -#ifdef BYTESWAP_DATA -#define VPERM(A, B, C, D) vperm A, B, C, D -#else -#define VPERM(A, B, C, D) -#endif - -/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ -FUNC_START(CRC_FUNCTION_NAME) - std r31,-8(r1) - std r30,-16(r1) - std r29,-24(r1) - std r28,-32(r1) - std r27,-40(r1) - std r26,-48(r1) - std r25,-56(r1) - - li off16,16 - li off32,32 - li off48,48 - li off64,64 - li off80,80 - li off96,96 - li off112,112 - li r0,0 - - /* Enough room for saving 10 non volatile VMX registers */ - subi r6,r1,56+10*16 - subi r7,r1,56+2*16 - - stvx v20,0,r6 - stvx v21,off16,r6 - stvx v22,off32,r6 - stvx v23,off48,r6 - stvx v24,off64,r6 - stvx v25,off80,r6 - stvx v26,off96,r6 - stvx v27,off112,r6 - stvx v28,0,r7 - stvx v29,off16,r7 - - mr r10,r3 - - vxor zeroes,zeroes,zeroes - vspltisw v0,-1 - - vsldoi mask_32bit,zeroes,v0,4 - vsldoi mask_64bit,zeroes,v0,8 - - /* Get the initial value into v8 */ - vxor v8,v8,v8 - MTVRD(v8, R3) -#ifdef REFLECT - vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ -#else - vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ -#endif - -#ifdef BYTESWAP_DATA - LOAD_REG_ADDR(r3, .byteswap_constant) - lvx byteswap,0,r3 - addi r3,r3,16 -#endif - - cmpdi r5,256 - blt .Lshort - - rldicr r6,r5,0,56 - - /* Checksum in blocks of MAX_SIZE */ -1: lis r7,MAX_SIZE@h - ori r7,r7,MAX_SIZE@l - mr r9,r7 - cmpd r6,r7 - bgt 2f - mr r7,r6 -2: subf r6,r7,r6 - - /* our main loop does 128 bytes at a time */ - srdi r7,r7,7 - - /* - * Work out the offset into the constants table to start at. Each - * constant is 16 bytes, and it is used against 128 bytes of input - * data - 128 / 16 = 8 - */ - sldi r8,r7,4 - srdi r9,r9,3 - subf r8,r8,r9 - - /* We reduce our final 128 bytes in a separate step */ - addi r7,r7,-1 - mtctr r7 - - LOAD_REG_ADDR(r3, .constants) - - /* Find the start of our constants */ - add r3,r3,r8 - - /* zero v0-v7 which will contain our checksums */ - vxor v0,v0,v0 - vxor v1,v1,v1 - vxor v2,v2,v2 - vxor v3,v3,v3 - vxor v4,v4,v4 - vxor v5,v5,v5 - vxor v6,v6,v6 - vxor v7,v7,v7 - - lvx const1,0,r3 - - /* - * If we are looping back to consume more data we use the values - * already in v16-v23. - */ - cmpdi r0,1 - beq 2f - - /* First warm up pass */ - lvx v16,0,r4 - lvx v17,off16,r4 - VPERM(v16,v16,v16,byteswap) - VPERM(v17,v17,v17,byteswap) - lvx v18,off32,r4 - lvx v19,off48,r4 - VPERM(v18,v18,v18,byteswap) - VPERM(v19,v19,v19,byteswap) - lvx v20,off64,r4 - lvx v21,off80,r4 - VPERM(v20,v20,v20,byteswap) - VPERM(v21,v21,v21,byteswap) - lvx v22,off96,r4 - lvx v23,off112,r4 - VPERM(v22,v22,v22,byteswap) - VPERM(v23,v23,v23,byteswap) - addi r4,r4,8*16 - - /* xor in initial value */ - vxor v16,v16,v8 - -2: bdz .Lfirst_warm_up_done - - addi r3,r3,16 - lvx const2,0,r3 - - /* Second warm up pass */ - VPMSUMD(v8,v16,const1) - lvx v16,0,r4 - VPERM(v16,v16,v16,byteswap) - ori r2,r2,0 - - VPMSUMD(v9,v17,const1) - lvx v17,off16,r4 - VPERM(v17,v17,v17,byteswap) - ori r2,r2,0 - - VPMSUMD(v10,v18,const1) - lvx v18,off32,r4 - VPERM(v18,v18,v18,byteswap) - ori r2,r2,0 - - VPMSUMD(v11,v19,const1) - lvx v19,off48,r4 - VPERM(v19,v19,v19,byteswap) - ori r2,r2,0 - - VPMSUMD(v12,v20,const1) - lvx v20,off64,r4 - VPERM(v20,v20,v20,byteswap) - ori r2,r2,0 - - VPMSUMD(v13,v21,const1) - lvx v21,off80,r4 - VPERM(v21,v21,v21,byteswap) - ori r2,r2,0 - - VPMSUMD(v14,v22,const1) - lvx v22,off96,r4 - VPERM(v22,v22,v22,byteswap) - ori r2,r2,0 - - VPMSUMD(v15,v23,const1) - lvx v23,off112,r4 - VPERM(v23,v23,v23,byteswap) - - addi r4,r4,8*16 - - bdz .Lfirst_cool_down - - /* - * main loop. We modulo schedule it such that it takes three iterations - * to complete - first iteration load, second iteration vpmsum, third - * iteration xor. - */ - .balign 16 -4: lvx const1,0,r3 - addi r3,r3,16 - ori r2,r2,0 - - vxor v0,v0,v8 - VPMSUMD(v8,v16,const2) - lvx v16,0,r4 - VPERM(v16,v16,v16,byteswap) - ori r2,r2,0 - - vxor v1,v1,v9 - VPMSUMD(v9,v17,const2) - lvx v17,off16,r4 - VPERM(v17,v17,v17,byteswap) - ori r2,r2,0 - - vxor v2,v2,v10 - VPMSUMD(v10,v18,const2) - lvx v18,off32,r4 - VPERM(v18,v18,v18,byteswap) - ori r2,r2,0 - - vxor v3,v3,v11 - VPMSUMD(v11,v19,const2) - lvx v19,off48,r4 - VPERM(v19,v19,v19,byteswap) - lvx const2,0,r3 - ori r2,r2,0 - - vxor v4,v4,v12 - VPMSUMD(v12,v20,const1) - lvx v20,off64,r4 - VPERM(v20,v20,v20,byteswap) - ori r2,r2,0 - - vxor v5,v5,v13 - VPMSUMD(v13,v21,const1) - lvx v21,off80,r4 - VPERM(v21,v21,v21,byteswap) - ori r2,r2,0 - - vxor v6,v6,v14 - VPMSUMD(v14,v22,const1) - lvx v22,off96,r4 - VPERM(v22,v22,v22,byteswap) - ori r2,r2,0 - - vxor v7,v7,v15 - VPMSUMD(v15,v23,const1) - lvx v23,off112,r4 - VPERM(v23,v23,v23,byteswap) - - addi r4,r4,8*16 - - bdnz 4b - -.Lfirst_cool_down: - /* First cool down pass */ - lvx const1,0,r3 - addi r3,r3,16 - - vxor v0,v0,v8 - VPMSUMD(v8,v16,const1) - ori r2,r2,0 - - vxor v1,v1,v9 - VPMSUMD(v9,v17,const1) - ori r2,r2,0 - - vxor v2,v2,v10 - VPMSUMD(v10,v18,const1) - ori r2,r2,0 - - vxor v3,v3,v11 - VPMSUMD(v11,v19,const1) - ori r2,r2,0 - - vxor v4,v4,v12 - VPMSUMD(v12,v20,const1) - ori r2,r2,0 - - vxor v5,v5,v13 - VPMSUMD(v13,v21,const1) - ori r2,r2,0 - - vxor v6,v6,v14 - VPMSUMD(v14,v22,const1) - ori r2,r2,0 - - vxor v7,v7,v15 - VPMSUMD(v15,v23,const1) - ori r2,r2,0 - -.Lsecond_cool_down: - /* Second cool down pass */ - vxor v0,v0,v8 - vxor v1,v1,v9 - vxor v2,v2,v10 - vxor v3,v3,v11 - vxor v4,v4,v12 - vxor v5,v5,v13 - vxor v6,v6,v14 - vxor v7,v7,v15 - -#ifdef REFLECT - /* - * vpmsumd produces a 96 bit result in the least significant bits - * of the register. Since we are bit reflected we have to shift it - * left 32 bits so it occupies the least significant bits in the - * bit reflected domain. - */ - vsldoi v0,v0,zeroes,4 - vsldoi v1,v1,zeroes,4 - vsldoi v2,v2,zeroes,4 - vsldoi v3,v3,zeroes,4 - vsldoi v4,v4,zeroes,4 - vsldoi v5,v5,zeroes,4 - vsldoi v6,v6,zeroes,4 - vsldoi v7,v7,zeroes,4 -#endif - - /* xor with last 1024 bits */ - lvx v8,0,r4 - lvx v9,off16,r4 - VPERM(v8,v8,v8,byteswap) - VPERM(v9,v9,v9,byteswap) - lvx v10,off32,r4 - lvx v11,off48,r4 - VPERM(v10,v10,v10,byteswap) - VPERM(v11,v11,v11,byteswap) - lvx v12,off64,r4 - lvx v13,off80,r4 - VPERM(v12,v12,v12,byteswap) - VPERM(v13,v13,v13,byteswap) - lvx v14,off96,r4 - lvx v15,off112,r4 - VPERM(v14,v14,v14,byteswap) - VPERM(v15,v15,v15,byteswap) - - addi r4,r4,8*16 - - vxor v16,v0,v8 - vxor v17,v1,v9 - vxor v18,v2,v10 - vxor v19,v3,v11 - vxor v20,v4,v12 - vxor v21,v5,v13 - vxor v22,v6,v14 - vxor v23,v7,v15 - - li r0,1 - cmpdi r6,0 - addi r6,r6,128 - bne 1b - - /* Work out how many bytes we have left */ - andi. r5,r5,127 - - /* Calculate where in the constant table we need to start */ - subfic r6,r5,128 - add r3,r3,r6 - - /* How many 16 byte chunks are in the tail */ - srdi r7,r5,4 - mtctr r7 - - /* - * Reduce the previously calculated 1024 bits to 64 bits, shifting - * 32 bits to include the trailing 32 bits of zeros - */ - lvx v0,0,r3 - lvx v1,off16,r3 - lvx v2,off32,r3 - lvx v3,off48,r3 - lvx v4,off64,r3 - lvx v5,off80,r3 - lvx v6,off96,r3 - lvx v7,off112,r3 - addi r3,r3,8*16 - - VPMSUMW(v0,v16,v0) - VPMSUMW(v1,v17,v1) - VPMSUMW(v2,v18,v2) - VPMSUMW(v3,v19,v3) - VPMSUMW(v4,v20,v4) - VPMSUMW(v5,v21,v5) - VPMSUMW(v6,v22,v6) - VPMSUMW(v7,v23,v7) - - /* Now reduce the tail (0 - 112 bytes) */ - cmpdi r7,0 - beq 1f - - lvx v16,0,r4 - lvx v17,0,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off16,r4 - lvx v17,off16,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off32,r4 - lvx v17,off32,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off48,r4 - lvx v17,off48,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off64,r4 - lvx v17,off64,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off80,r4 - lvx v17,off80,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - bdz 1f - - lvx v16,off96,r4 - lvx v17,off96,r3 - VPERM(v16,v16,v16,byteswap) - VPMSUMW(v16,v16,v17) - vxor v0,v0,v16 - - /* Now xor all the parallel chunks together */ -1: vxor v0,v0,v1 - vxor v2,v2,v3 - vxor v4,v4,v5 - vxor v6,v6,v7 - - vxor v0,v0,v2 - vxor v4,v4,v6 - - vxor v0,v0,v4 - -.Lbarrett_reduction: - /* Barrett constants */ - LOAD_REG_ADDR(r3, .barrett_constants) - - lvx const1,0,r3 - lvx const2,off16,r3 - - vsldoi v1,v0,v0,8 - vxor v0,v0,v1 /* xor two 64 bit results together */ - -#ifdef REFLECT - /* shift left one bit */ - vspltisb v1,1 - vsl v0,v0,v1 -#endif - - vand v0,v0,mask_64bit -#ifndef REFLECT - /* - * Now for the Barrett reduction algorithm. The idea is to calculate q, - * the multiple of our polynomial that we need to subtract. By - * doing the computation 2x bits higher (ie 64 bits) and shifting the - * result back down 2x bits, we round down to the nearest multiple. - */ - VPMSUMD(v1,v0,const1) /* ma */ - vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ - VPMSUMD(v1,v1,const2) /* qn */ - vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ - - /* - * Get the result into r3. We need to shift it left 8 bytes: - * V0 [ 0 1 2 X ] - * V0 [ 0 X 2 3 ] - */ - vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ -#else - /* - * The reflected version of Barrett reduction. Instead of bit - * reflecting our data (which is expensive to do), we bit reflect our - * constants and our algorithm, which means the intermediate data in - * our vector registers goes from 0-63 instead of 63-0. We can reflect - * the algorithm because we don't carry in mod 2 arithmetic. - */ - vand v1,v0,mask_32bit /* bottom 32 bits of a */ - VPMSUMD(v1,v1,const1) /* ma */ - vand v1,v1,mask_32bit /* bottom 32bits of ma */ - VPMSUMD(v1,v1,const2) /* qn */ - vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ - - /* - * Since we are bit reflected, the result (ie the low 32 bits) is in - * the high 32 bits. We just need to shift it left 4 bytes - * V0 [ 0 1 X 3 ] - * V0 [ 0 X 2 3 ] - */ - vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ -#endif - - /* Get it into r3 */ - MFVRD(R3, v0) - -.Lout: - subi r6,r1,56+10*16 - subi r7,r1,56+2*16 - - lvx v20,0,r6 - lvx v21,off16,r6 - lvx v22,off32,r6 - lvx v23,off48,r6 - lvx v24,off64,r6 - lvx v25,off80,r6 - lvx v26,off96,r6 - lvx v27,off112,r6 - lvx v28,0,r7 - lvx v29,off16,r7 - - ld r31,-8(r1) - ld r30,-16(r1) - ld r29,-24(r1) - ld r28,-32(r1) - ld r27,-40(r1) - ld r26,-48(r1) - ld r25,-56(r1) - - blr - -.Lfirst_warm_up_done: - lvx const1,0,r3 - addi r3,r3,16 - - VPMSUMD(v8,v16,const1) - VPMSUMD(v9,v17,const1) - VPMSUMD(v10,v18,const1) - VPMSUMD(v11,v19,const1) - VPMSUMD(v12,v20,const1) - VPMSUMD(v13,v21,const1) - VPMSUMD(v14,v22,const1) - VPMSUMD(v15,v23,const1) - - b .Lsecond_cool_down - -.Lshort: - cmpdi r5,0 - beq .Lzero - - LOAD_REG_ADDR(r3, .short_constants) - - /* Calculate where in the constant table we need to start */ - subfic r6,r5,256 - add r3,r3,r6 - - /* How many 16 byte chunks? */ - srdi r7,r5,4 - mtctr r7 - - vxor v19,v19,v19 - vxor v20,v20,v20 - - lvx v0,0,r4 - lvx v16,0,r3 - VPERM(v0,v0,v16,byteswap) - vxor v0,v0,v8 /* xor in initial value */ - VPMSUMW(v0,v0,v16) - bdz .Lv0 - - lvx v1,off16,r4 - lvx v17,off16,r3 - VPERM(v1,v1,v17,byteswap) - VPMSUMW(v1,v1,v17) - bdz .Lv1 - - lvx v2,off32,r4 - lvx v16,off32,r3 - VPERM(v2,v2,v16,byteswap) - VPMSUMW(v2,v2,v16) - bdz .Lv2 - - lvx v3,off48,r4 - lvx v17,off48,r3 - VPERM(v3,v3,v17,byteswap) - VPMSUMW(v3,v3,v17) - bdz .Lv3 - - lvx v4,off64,r4 - lvx v16,off64,r3 - VPERM(v4,v4,v16,byteswap) - VPMSUMW(v4,v4,v16) - bdz .Lv4 - - lvx v5,off80,r4 - lvx v17,off80,r3 - VPERM(v5,v5,v17,byteswap) - VPMSUMW(v5,v5,v17) - bdz .Lv5 - - lvx v6,off96,r4 - lvx v16,off96,r3 - VPERM(v6,v6,v16,byteswap) - VPMSUMW(v6,v6,v16) - bdz .Lv6 - - lvx v7,off112,r4 - lvx v17,off112,r3 - VPERM(v7,v7,v17,byteswap) - VPMSUMW(v7,v7,v17) - bdz .Lv7 - - addi r3,r3,128 - addi r4,r4,128 - - lvx v8,0,r4 - lvx v16,0,r3 - VPERM(v8,v8,v16,byteswap) - VPMSUMW(v8,v8,v16) - bdz .Lv8 - - lvx v9,off16,r4 - lvx v17,off16,r3 - VPERM(v9,v9,v17,byteswap) - VPMSUMW(v9,v9,v17) - bdz .Lv9 - - lvx v10,off32,r4 - lvx v16,off32,r3 - VPERM(v10,v10,v16,byteswap) - VPMSUMW(v10,v10,v16) - bdz .Lv10 - - lvx v11,off48,r4 - lvx v17,off48,r3 - VPERM(v11,v11,v17,byteswap) - VPMSUMW(v11,v11,v17) - bdz .Lv11 - - lvx v12,off64,r4 - lvx v16,off64,r3 - VPERM(v12,v12,v16,byteswap) - VPMSUMW(v12,v12,v16) - bdz .Lv12 - - lvx v13,off80,r4 - lvx v17,off80,r3 - VPERM(v13,v13,v17,byteswap) - VPMSUMW(v13,v13,v17) - bdz .Lv13 - - lvx v14,off96,r4 - lvx v16,off96,r3 - VPERM(v14,v14,v16,byteswap) - VPMSUMW(v14,v14,v16) - bdz .Lv14 - - lvx v15,off112,r4 - lvx v17,off112,r3 - VPERM(v15,v15,v17,byteswap) - VPMSUMW(v15,v15,v17) - -.Lv15: vxor v19,v19,v15 -.Lv14: vxor v20,v20,v14 -.Lv13: vxor v19,v19,v13 -.Lv12: vxor v20,v20,v12 -.Lv11: vxor v19,v19,v11 -.Lv10: vxor v20,v20,v10 -.Lv9: vxor v19,v19,v9 -.Lv8: vxor v20,v20,v8 -.Lv7: vxor v19,v19,v7 -.Lv6: vxor v20,v20,v6 -.Lv5: vxor v19,v19,v5 -.Lv4: vxor v20,v20,v4 -.Lv3: vxor v19,v19,v3 -.Lv2: vxor v20,v20,v2 -.Lv1: vxor v19,v19,v1 -.Lv0: vxor v20,v20,v0 - - vxor v0,v19,v20 - - b .Lbarrett_reduction - -.Lzero: - mr r3,r10 - b .Lout - -FUNC_END(CRC_FUNCTION_NAME) diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S deleted file mode 100644 index bf442004ea1f..000000000000 --- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S +++ /dev/null @@ -1,842 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Calculate a crc32c with vpmsum acceleration - * - * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM - */ - .section .rodata -.balign 16 - -.byteswap_constant: - /* byte reverse permute constant */ - .octa 0x0F0E0D0C0B0A09080706050403020100 - -.constants: - - /* Reduce 262144 kbits to 1024 bits */ - /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ - .octa 0x00000000b6ca9e20000000009c37c408 - - /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ - .octa 0x00000000350249a800000001b51df26c - - /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ - .octa 0x00000001862dac54000000000724b9d0 - - /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ - .octa 0x00000001d87fb48c00000001c00532fe - - /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ - .octa 0x00000001f39b699e00000000f05a9362 - - /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ - .octa 0x0000000101da11b400000001e1007970 - - /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ - .octa 0x00000001cab571e000000000a57366ee - - /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ - .octa 0x00000000c7020cfe0000000192011284 - - /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ - .octa 0x00000000cdaed1ae0000000162716d9a - - /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ - .octa 0x00000001e804effc00000000cd97ecde - - /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ - .octa 0x0000000077c3ea3a0000000058812bc0 - - /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ - .octa 0x0000000068df31b40000000088b8c12e - - /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ - .octa 0x00000000b059b6c200000001230b234c - - /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ - .octa 0x0000000145fb8ed800000001120b416e - - /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ - .octa 0x00000000cbc0916800000001974aecb0 - - /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ - .octa 0x000000005ceeedc2000000008ee3f226 - - /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ - .octa 0x0000000047d74e8600000001089aba9a - - /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ - .octa 0x00000001407e9e220000000065113872 - - /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ - .octa 0x00000001da967bda000000005c07ec10 - - /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ - .octa 0x000000006c8983680000000187590924 - - /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ - .octa 0x00000000f2d14c9800000000e35da7c6 - - /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ - .octa 0x00000001993c6ad4000000000415855a - - /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ - .octa 0x000000014683d1ac0000000073617758 - - /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ - .octa 0x00000001a7c93e6c0000000176021d28 - - /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ - .octa 0x000000010211e90a00000001c358fd0a - - /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ - .octa 0x000000001119403e00000001ff7a2c18 - - /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ - .octa 0x000000001c3261aa00000000f2d9f7e4 - - /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ - .octa 0x000000014e37a634000000016cf1f9c8 - - /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ - .octa 0x0000000073786c0c000000010af9279a - - /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ - .octa 0x000000011dc037f80000000004f101e8 - - /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ - .octa 0x0000000031433dfc0000000070bcf184 - - /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ - .octa 0x000000009cde8348000000000a8de642 - - /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ - .octa 0x0000000038d3c2a60000000062ea130c - - /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ - .octa 0x000000011b25f26000000001eb31cbb2 - - /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ - .octa 0x000000001629e6f00000000170783448 - - /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ - .octa 0x0000000160838b4c00000001a684b4c6 - - /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ - .octa 0x000000007a44011c00000000253ca5b4 - - /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ - .octa 0x00000000226f417a0000000057b4b1e2 - - /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ - .octa 0x0000000045eb2eb400000000b6bd084c - - /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ - .octa 0x000000014459d70c0000000123c2d592 - - /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ - .octa 0x00000001d406ed8200000000159dafce - - /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ - .octa 0x0000000160c8e1a80000000127e1a64e - - /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ - .octa 0x0000000027ba80980000000056860754 - - /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ - .octa 0x000000006d92d01800000001e661aae8 - - /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ - .octa 0x000000012ed7e3f200000000f82c6166 - - /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ - .octa 0x000000002dc8778800000000c4f9c7ae - - /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ - .octa 0x0000000018240bb80000000074203d20 - - /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ - .octa 0x000000001ad381580000000198173052 - - /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ - .octa 0x00000001396b78f200000001ce8aba54 - - /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ - .octa 0x000000011a68133400000001850d5d94 - - /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ - .octa 0x000000012104732e00000001d609239c - - /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ - .octa 0x00000000a140d90c000000001595f048 - - /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ - .octa 0x00000001b7215eda0000000042ccee08 - - /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ - .octa 0x00000001aaf1df3c000000010a389d74 - - /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ - .octa 0x0000000029d15b8a000000012a840da6 - - /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ - .octa 0x00000000f1a96922000000001d181c0c - - /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ - .octa 0x00000001ac80d03c0000000068b7d1f6 - - /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ - .octa 0x000000000f11d56a000000005b0f14fc - - /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ - .octa 0x00000001f1c022a20000000179e9e730 - - /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ - .octa 0x0000000173d00ae200000001ce1368d6 - - /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ - .octa 0x00000001d4ffe4ac0000000112c3a84c - - /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ - .octa 0x000000016edc5ae400000000de940fee - - /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ - .octa 0x00000001f1a0214000000000fe896b7e - - /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ - .octa 0x00000000ca0b28a000000001f797431c - - /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ - .octa 0x00000001928e30a20000000053e989ba - - /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ - .octa 0x0000000097b1b002000000003920cd16 - - /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ - .octa 0x00000000b15bf90600000001e6f579b8 - - /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ - .octa 0x00000000411c5d52000000007493cb0a - - /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ - .octa 0x00000001c36f330000000001bdd376d8 - - /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ - .octa 0x00000001119227e0000000016badfee6 - - /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ - .octa 0x00000000114d47020000000071de5c58 - - /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ - .octa 0x00000000458b5b9800000000453f317c - - /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ - .octa 0x000000012e31fb8e0000000121675cce - - /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ - .octa 0x000000005cf619d800000001f409ee92 - - /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ - .octa 0x0000000063f4d8b200000000f36b9c88 - - /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ - .octa 0x000000004138dc8a0000000036b398f4 - - /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ - .octa 0x00000001d29ee8e000000001748f9adc - - /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ - .octa 0x000000006a08ace800000001be94ec00 - - /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ - .octa 0x0000000127d4201000000000b74370d6 - - /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ - .octa 0x0000000019d76b6200000001174d0b98 - - /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ - .octa 0x00000001b1471f6e00000000befc06a4 - - /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ - .octa 0x00000001f64c19cc00000001ae125288 - - /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ - .octa 0x00000000003c0ea00000000095c19b34 - - /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ - .octa 0x000000014d73abf600000001a78496f2 - - /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ - .octa 0x00000001620eb84400000001ac5390a0 - - /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ - .octa 0x0000000147655048000000002a80ed6e - - /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ - .octa 0x0000000067b5077e00000001fa9b0128 - - /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ - .octa 0x0000000010ffe20600000001ea94929e - - /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ - .octa 0x000000000fee8f1e0000000125f4305c - - /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ - .octa 0x00000001da26fbae00000001471e2002 - - /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ - .octa 0x00000001b3a8bd880000000132d2253a - - /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ - .octa 0x00000000e8f3898e00000000f26b3592 - - /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ - .octa 0x00000000b0d0d28c00000000bc8b67b0 - - /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ - .octa 0x0000000030f2a798000000013a826ef2 - - /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ - .octa 0x000000000fba10020000000081482c84 - - /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ - .octa 0x00000000bdb9bd7200000000e77307c2 - - /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ - .octa 0x0000000075d3bf5a00000000d4a07ec8 - - /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ - .octa 0x00000000ef1f98a00000000017102100 - - /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ - .octa 0x00000000689c760200000000db406486 - - /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ - .octa 0x000000016d5fa5fe0000000192db7f88 - - /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ - .octa 0x00000001d0d2b9ca000000018bf67b1e - - /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ - .octa 0x0000000041e7b470000000007c09163e - - /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ - .octa 0x00000001cbb6495e000000000adac060 - - /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ - .octa 0x000000010052a0b000000000bd8316ae - - /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ - .octa 0x00000001d8effb5c000000019f09ab54 - - /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ - .octa 0x00000001d969853c0000000125155542 - - /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ - .octa 0x00000000523ccce2000000018fdb5882 - - /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ - .octa 0x000000001e2436bc00000000e794b3f4 - - /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ - .octa 0x00000000ddd1c3a2000000016f9bb022 - - /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ - .octa 0x0000000019fcfe3800000000290c9978 - - /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ - .octa 0x00000001ce95db640000000083c0f350 - - /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ - .octa 0x00000000af5828060000000173ea6628 - - /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ - .octa 0x00000001006388f600000001c8b4e00a - - /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ - .octa 0x0000000179eca00a00000000de95d6aa - - /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ - .octa 0x0000000122410a6a000000010b7f7248 - - /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ - .octa 0x000000004288e87c00000001326e3a06 - - /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ - .octa 0x000000016c5490da00000000bb62c2e6 - - /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ - .octa 0x00000000d1c71f6e0000000156a4b2c2 - - /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ - .octa 0x00000001b4ce08a6000000011dfe763a - - /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ - .octa 0x00000001466ba60c000000007bcca8e2 - - /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ - .octa 0x00000001f6c488a40000000186118faa - - /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ - .octa 0x000000013bfb06820000000111a65a88 - - /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ - .octa 0x00000000690e9e54000000003565e1c4 - - /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ - .octa 0x00000000281346b6000000012ed02a82 - - /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ - .octa 0x000000015646402400000000c486ecfc - - /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ - .octa 0x000000016063a8dc0000000001b951b2 - - /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ - .octa 0x0000000116a663620000000048143916 - - /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ - .octa 0x000000017e8aa4d200000001dc2ae124 - - /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ - .octa 0x00000001728eb10c00000001416c58d6 - - /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ - .octa 0x00000001b08fd7fa00000000a479744a - - /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ - .octa 0x00000001092a16e80000000096ca3a26 - - /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ - .octa 0x00000000a505637c00000000ff223d4e - - /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ - .octa 0x00000000d94869b2000000010e84da42 - - /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ - .octa 0x00000001c8b203ae00000001b61ba3d0 - - /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ - .octa 0x000000005704aea000000000680f2de8 - - /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ - .octa 0x000000012e295fa2000000008772a9a8 - - /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ - .octa 0x000000011d0908bc0000000155f295bc - - /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ - .octa 0x0000000193ed97ea00000000595f9282 - - /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ - .octa 0x000000013a0f1c520000000164b1c25a - - /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ - .octa 0x000000010c2c40c000000000fbd67c50 - - /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ - .octa 0x00000000ff6fac3e0000000096076268 - - /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ - .octa 0x000000017b3609c000000001d288e4cc - - /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ - .octa 0x0000000088c8c92200000001eaac1bdc - - /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ - .octa 0x00000001751baae600000001f1ea39e2 - - /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ - .octa 0x000000010795297200000001eb6506fc - - /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ - .octa 0x0000000162b00abe000000010f806ffe - - /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ - .octa 0x000000000d7b404c000000010408481e - - /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ - .octa 0x00000000763b13d40000000188260534 - - /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ - .octa 0x00000000f6dc22d80000000058fc73e0 - - /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ - .octa 0x000000007daae06000000000391c59b8 - - /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ - .octa 0x000000013359ab7c000000018b638400 - - /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ - .octa 0x000000008add438a000000011738f5c4 - - /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ - .octa 0x00000001edbefdea000000008cf7c6da - - /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ - .octa 0x000000004104e0f800000001ef97fb16 - - /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ - .octa 0x00000000b48a82220000000102130e20 - - /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ - .octa 0x00000001bcb4684400000000db968898 - - /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ - .octa 0x000000013293ce0a00000000b5047b5e - - /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ - .octa 0x00000001710d0844000000010b90fdb2 - - /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ - .octa 0x0000000117907f6e000000004834a32e - - /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ - .octa 0x0000000087ddf93e0000000059c8f2b0 - - /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ - .octa 0x000000005970e9b00000000122cec508 - - /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ - .octa 0x0000000185b2b7d0000000000a330cda - - /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ - .octa 0x00000001dcee0efc000000014a47148c - - /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ - .octa 0x0000000030da27220000000042c61cb8 - - /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ - .octa 0x000000012f925a180000000012fe6960 - - /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ - .octa 0x00000000dd2e357c00000000dbda2c20 - - /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ - .octa 0x00000000071c80de000000011122410c - - /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ - .octa 0x000000011513140a00000000977b2070 - - /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ - .octa 0x00000001df876e8e000000014050438e - - /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ - .octa 0x000000015f81d6ce0000000147c840e8 - - /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ - .octa 0x000000019dd94dbe00000001cc7c88ce - - /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ - .octa 0x00000001373d206e00000001476b35a4 - - /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ - .octa 0x00000000668ccade000000013d52d508 - - /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ - .octa 0x00000001b192d268000000008e4be32e - - /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ - .octa 0x00000000e30f3a7800000000024120fe - - /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ - .octa 0x000000010ef1f7bc00000000ddecddb4 - - /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ - .octa 0x00000001f5ac738000000000d4d403bc - - /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ - .octa 0x000000011822ea7000000001734b89aa - - /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ - .octa 0x00000000c3a33848000000010e7a58d6 - - /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ - .octa 0x00000001bd151c2400000001f9f04e9c - - /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ - .octa 0x0000000056002d7600000000b692225e - - /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ - .octa 0x000000014657c4f4000000019b8d3f3e - - /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ - .octa 0x0000000113742d7c00000001a874f11e - - /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ - .octa 0x000000019c5920ba000000010d5a4254 - - /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ - .octa 0x000000005216d2d600000000bbb2f5d6 - - /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ - .octa 0x0000000136f5ad8a0000000179cc0e36 - - /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ - .octa 0x000000018b07beb600000001dca1da4a - - /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ - .octa 0x00000000db1e93b000000000feb1a192 - - /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ - .octa 0x000000000b96fa3a00000000d1eeedd6 - - /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ - .octa 0x00000001d9968af0000000008fad9bb4 - - /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ - .octa 0x000000000e4a77a200000001884938e4 - - /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ - .octa 0x00000000508c2ac800000001bc2e9bc0 - - /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ - .octa 0x0000000021572a8000000001f9658a68 - - /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ - .octa 0x00000001b859daf2000000001b9224fc - - /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ - .octa 0x000000016f7884740000000055b2fb84 - - /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ - .octa 0x00000001b438810e000000018b090348 - - /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ - .octa 0x0000000095ddc6f2000000011ccbd5ea - - /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ - .octa 0x00000001d977c20c0000000007ae47f8 - - /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ - .octa 0x00000000ebedb99a0000000172acbec0 - - /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ - .octa 0x00000001df9e9e9200000001c6e3ff20 - - /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ - .octa 0x00000001a4a3f95200000000e1b38744 - - /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ - .octa 0x00000000e2f5122000000000791585b2 - - /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ - .octa 0x000000004aa01f3e00000000ac53b894 - - /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ - .octa 0x00000000b3e90a5800000001ed5f2cf4 - - /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ - .octa 0x000000000c9ca2aa00000001df48b2e0 - - /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ - .octa 0x000000015168231600000000049c1c62 - - /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ - .octa 0x0000000036fce78c000000017c460c12 - - /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ - .octa 0x000000009037dc10000000015be4da7e - - /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ - .octa 0x00000000d3298582000000010f38f668 - - /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ - .octa 0x00000001b42e8ad60000000039f40a00 - - /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ - .octa 0x00000000142a983800000000bd4c10c4 - - /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ - .octa 0x0000000109c7f1900000000042db1d98 - - /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ - .octa 0x0000000056ff931000000001c905bae6 - - /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ - .octa 0x00000001594513aa00000000069d40ea - - /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ - .octa 0x00000001e3b5b1e8000000008e4fbad0 - - /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ - .octa 0x000000011dd5fc080000000047bedd46 - - /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ - .octa 0x00000001675f0cc20000000026396bf8 - - /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ - .octa 0x00000000d1c8dd4400000000379beb92 - - /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ - .octa 0x0000000115ebd3d8000000000abae54a - - /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ - .octa 0x00000001ecbd0dac0000000007e6a128 - - /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ - .octa 0x00000000cdf67af2000000000ade29d2 - - /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ - .octa 0x000000004c01ff4c00000000f974c45c - - /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ - .octa 0x00000000f2d8657e00000000e77ac60a - - /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ - .octa 0x000000006bae74c40000000145895816 - - /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ - .octa 0x0000000152af8aa00000000038e362be - - /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ - .octa 0x0000000004663802000000007f991a64 - - /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ - .octa 0x00000001ab2f5afc00000000fa366d3a - - /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ - .octa 0x0000000074a4ebd400000001a2bb34f0 - - /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ - .octa 0x00000001d7ab3a4c0000000028a9981e - - /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ - .octa 0x00000001a8da60c600000001dbc672be - - /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ - .octa 0x000000013cf6382000000000b04d77f6 - - /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ - .octa 0x00000000bec12e1e0000000124400d96 - - /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ - .octa 0x00000001c6368010000000014ca4b414 - - /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ - .octa 0x00000001e6e78758000000012fe2c938 - - /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ - .octa 0x000000008d7f2b3c00000001faed01e6 - - /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ - .octa 0x000000016b4a156e000000007e80ecfe - - /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ - .octa 0x00000001c63cfeb60000000098daee94 - - /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ - .octa 0x000000015f902670000000010a04edea - - /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ - .octa 0x00000001cd5de11e00000001c00b4524 - - /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ - .octa 0x000000001acaec540000000170296550 - - /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ - .octa 0x000000002bd0ca780000000181afaa48 - - /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ - .octa 0x0000000032d63d5c0000000185a31ffa - - /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ - .octa 0x000000001c6d4e4c000000002469f608 - - /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ - .octa 0x0000000106a60b92000000006980102a - - /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ - .octa 0x00000000d3855e120000000111ea9ca8 - - /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ - .octa 0x00000000e312563600000001bd1d29ce - - /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ - .octa 0x000000009e8f7ea400000001b34b9580 - - /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ - .octa 0x00000001c82e562c000000003076054e - - /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ - .octa 0x00000000ca9f09ce000000012a608ea4 - - /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ - .octa 0x00000000c63764e600000000784d05fe - - /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ - .octa 0x0000000168d2e49e000000016ef0d82a - - /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ - .octa 0x00000000e986c1480000000075bda454 - - /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ - .octa 0x00000000cfb65894000000003dc0a1c4 - - /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ - .octa 0x0000000111cadee400000000e9a5d8be - - /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ - .octa 0x0000000171fb63ce00000001609bc4b4 - -.short_constants: - - /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ - /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ - .octa 0x7fec2963e5bf80485cf015c388e56f72 - - /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ - .octa 0x38e888d4844752a9963a18920246e2e6 - - /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ - .octa 0x42316c00730206ad419a441956993a31 - - /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ - .octa 0x543d5c543e65ddf9924752ba2b830011 - - /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ - .octa 0x78e87aaf56767c9255bd7f9518e4a304 - - /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ - .octa 0x8f68fcec1903da7f6d76739fe0553f1e - - /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ - .octa 0x3f4840246791d588c133722b1fe0b5c3 - - /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ - .octa 0x34c96751b04de25a64b67ee0e55ef1f3 - - /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ - .octa 0x156c8e180b4a395b069db049b8fdb1e7 - - /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ - .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e - - /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ - .octa 0x041d37768cd75659817cdc5119b29a35 - - /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ - .octa 0x3a0777818cfaa9651ce9d94b36c41f1c - - /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ - .octa 0x0e148e8252377a554f256efcb82be955 - - /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ - .octa 0x9c25531d19e65ddeec1631edb2dea967 - - /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ - .octa 0x790606ff9957c0a65d27e147510ac59a - - /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ - .octa 0x82f63b786ea2d55ca66805eb18b8ea18 - - -.barrett_constants: - /* 33 bit reflected Barrett constant m - (4^32)/n */ - .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ - /* 33 bit reflected Barrett constant n */ - .octa 0x00000000000000000000000105ec76f1 - -#define CRC_FUNCTION_NAME __crc32c_vpmsum -#define REFLECT -#include "crc32-vpmsum_core.S" diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c deleted file mode 100644 index 63760b7dbb76..000000000000 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include <linux/crc32.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/cpufeature.h> -#include <asm/simd.h> -#include <asm/switch_to.h> - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#define VECTOR_BREAKPOINT 512 - -u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len); - -static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int prealign; - unsigned int tail; - - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) - return __crc32c_le(crc, p, len); - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = __crc32c_le(crc, p, prealign); - len -= prealign; - p += prealign; - } - - if (len & ~VMX_ALIGN_MASK) { - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - disable_kernel_altivec(); - pagefault_enable(); - preempt_enable(); - } - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = __crc32c_le(crc, p, tail); - } - - return crc; -} - -static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = ~0; - - return 0; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. - */ -static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32c_vpmsum_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32c_vpmsum(*crcp, data, len); - - return 0; -} - -static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len)); - - return 0; -} - -static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = ~cpu_to_le32p(crcp); - - return 0; -} - -static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static struct shash_alg alg = { - .setkey = crc32c_vpmsum_setkey, - .init = crc32c_vpmsum_init, - .update = crc32c_vpmsum_update, - .final = crc32c_vpmsum_final, - .finup = crc32c_vpmsum_finup, - .digest = crc32c_vpmsum_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-vpmsum", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32c_vpmsum_cra_init, - } -}; - -static int __init crc32c_vpmsum_mod_init(void) -{ - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit crc32c_vpmsum_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init); -module_exit(crc32c_vpmsum_mod_fini); - -MODULE_AUTHOR("Anton Blanchard <anton@samba.org>"); -MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("crc32c"); -MODULE_ALIAS_CRYPTO("crc32c-vpmsum"); diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S deleted file mode 100644 index f0b93a0fe168..000000000000 --- a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S +++ /dev/null @@ -1,845 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Calculate a CRC T10DIF with vpmsum acceleration - * - * Constants generated by crc32-vpmsum, available at - * https://github.com/antonblanchard/crc32-vpmsum - * - * crc32-vpmsum is - * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM - */ - .section .rodata -.balign 16 - -.byteswap_constant: - /* byte reverse permute constant */ - .octa 0x0F0E0D0C0B0A09080706050403020100 - -.constants: - - /* Reduce 262144 kbits to 1024 bits */ - /* x^261184 mod p(x), x^261120 mod p(x) */ - .octa 0x0000000056d300000000000052550000 - - /* x^260160 mod p(x), x^260096 mod p(x) */ - .octa 0x00000000ee67000000000000a1e40000 - - /* x^259136 mod p(x), x^259072 mod p(x) */ - .octa 0x0000000060830000000000004ad10000 - - /* x^258112 mod p(x), x^258048 mod p(x) */ - .octa 0x000000008cfe0000000000009ab40000 - - /* x^257088 mod p(x), x^257024 mod p(x) */ - .octa 0x000000003e93000000000000fdb50000 - - /* x^256064 mod p(x), x^256000 mod p(x) */ - .octa 0x000000003c2000000000000045480000 - - /* x^255040 mod p(x), x^254976 mod p(x) */ - .octa 0x00000000b1fc0000000000008d690000 - - /* x^254016 mod p(x), x^253952 mod p(x) */ - .octa 0x00000000f82b00000000000024ad0000 - - /* x^252992 mod p(x), x^252928 mod p(x) */ - .octa 0x0000000044420000000000009f1a0000 - - /* x^251968 mod p(x), x^251904 mod p(x) */ - .octa 0x00000000e88c00000000000066ec0000 - - /* x^250944 mod p(x), x^250880 mod p(x) */ - .octa 0x00000000385c000000000000c87d0000 - - /* x^249920 mod p(x), x^249856 mod p(x) */ - .octa 0x000000003227000000000000c8ff0000 - - /* x^248896 mod p(x), x^248832 mod p(x) */ - .octa 0x00000000a9a900000000000033440000 - - /* x^247872 mod p(x), x^247808 mod p(x) */ - .octa 0x00000000abaa00000000000066eb0000 - - /* x^246848 mod p(x), x^246784 mod p(x) */ - .octa 0x000000001ac3000000000000c4ef0000 - - /* x^245824 mod p(x), x^245760 mod p(x) */ - .octa 0x0000000063f000000000000056f30000 - - /* x^244800 mod p(x), x^244736 mod p(x) */ - .octa 0x0000000032cc00000000000002050000 - - /* x^243776 mod p(x), x^243712 mod p(x) */ - .octa 0x00000000f8b5000000000000568e0000 - - /* x^242752 mod p(x), x^242688 mod p(x) */ - .octa 0x000000008db100000000000064290000 - - /* x^241728 mod p(x), x^241664 mod p(x) */ - .octa 0x0000000059ca0000000000006b660000 - - /* x^240704 mod p(x), x^240640 mod p(x) */ - .octa 0x000000005f5c00000000000018f80000 - - /* x^239680 mod p(x), x^239616 mod p(x) */ - .octa 0x0000000061af000000000000b6090000 - - /* x^238656 mod p(x), x^238592 mod p(x) */ - .octa 0x00000000e29e000000000000099a0000 - - /* x^237632 mod p(x), x^237568 mod p(x) */ - .octa 0x000000000975000000000000a8360000 - - /* x^236608 mod p(x), x^236544 mod p(x) */ - .octa 0x0000000043900000000000004f570000 - - /* x^235584 mod p(x), x^235520 mod p(x) */ - .octa 0x00000000f9cd000000000000134c0000 - - /* x^234560 mod p(x), x^234496 mod p(x) */ - .octa 0x000000007c29000000000000ec380000 - - /* x^233536 mod p(x), x^233472 mod p(x) */ - .octa 0x000000004c6a000000000000b0d10000 - - /* x^232512 mod p(x), x^232448 mod p(x) */ - .octa 0x00000000e7290000000000007d3e0000 - - /* x^231488 mod p(x), x^231424 mod p(x) */ - .octa 0x00000000f1ab000000000000f0b20000 - - /* x^230464 mod p(x), x^230400 mod p(x) */ - .octa 0x0000000039db0000000000009c270000 - - /* x^229440 mod p(x), x^229376 mod p(x) */ - .octa 0x000000005e2800000000000092890000 - - /* x^228416 mod p(x), x^228352 mod p(x) */ - .octa 0x00000000d44e000000000000d5ee0000 - - /* x^227392 mod p(x), x^227328 mod p(x) */ - .octa 0x00000000cd0a00000000000041f50000 - - /* x^226368 mod p(x), x^226304 mod p(x) */ - .octa 0x00000000c5b400000000000010520000 - - /* x^225344 mod p(x), x^225280 mod p(x) */ - .octa 0x00000000fd2100000000000042170000 - - /* x^224320 mod p(x), x^224256 mod p(x) */ - .octa 0x000000002f2500000000000095c20000 - - /* x^223296 mod p(x), x^223232 mod p(x) */ - .octa 0x000000001b0100000000000001ce0000 - - /* x^222272 mod p(x), x^222208 mod p(x) */ - .octa 0x000000000d430000000000002aca0000 - - /* x^221248 mod p(x), x^221184 mod p(x) */ - .octa 0x0000000030a6000000000000385e0000 - - /* x^220224 mod p(x), x^220160 mod p(x) */ - .octa 0x00000000e37b0000000000006f7a0000 - - /* x^219200 mod p(x), x^219136 mod p(x) */ - .octa 0x00000000873600000000000024320000 - - /* x^218176 mod p(x), x^218112 mod p(x) */ - .octa 0x00000000e9fb000000000000bd9c0000 - - /* x^217152 mod p(x), x^217088 mod p(x) */ - .octa 0x000000003b9500000000000054bc0000 - - /* x^216128 mod p(x), x^216064 mod p(x) */ - .octa 0x00000000133e000000000000a4660000 - - /* x^215104 mod p(x), x^215040 mod p(x) */ - .octa 0x00000000784500000000000079930000 - - /* x^214080 mod p(x), x^214016 mod p(x) */ - .octa 0x00000000b9800000000000001bb80000 - - /* x^213056 mod p(x), x^212992 mod p(x) */ - .octa 0x00000000687600000000000024400000 - - /* x^212032 mod p(x), x^211968 mod p(x) */ - .octa 0x00000000aff300000000000029e10000 - - /* x^211008 mod p(x), x^210944 mod p(x) */ - .octa 0x0000000024b50000000000005ded0000 - - /* x^209984 mod p(x), x^209920 mod p(x) */ - .octa 0x0000000017e8000000000000b12e0000 - - /* x^208960 mod p(x), x^208896 mod p(x) */ - .octa 0x00000000128400000000000026d20000 - - /* x^207936 mod p(x), x^207872 mod p(x) */ - .octa 0x000000002115000000000000a32a0000 - - /* x^206912 mod p(x), x^206848 mod p(x) */ - .octa 0x000000009595000000000000a1210000 - - /* x^205888 mod p(x), x^205824 mod p(x) */ - .octa 0x00000000281e000000000000ee8b0000 - - /* x^204864 mod p(x), x^204800 mod p(x) */ - .octa 0x0000000006010000000000003d0d0000 - - /* x^203840 mod p(x), x^203776 mod p(x) */ - .octa 0x00000000e2b600000000000034e90000 - - /* x^202816 mod p(x), x^202752 mod p(x) */ - .octa 0x000000001bd40000000000004cdb0000 - - /* x^201792 mod p(x), x^201728 mod p(x) */ - .octa 0x00000000df2800000000000030e90000 - - /* x^200768 mod p(x), x^200704 mod p(x) */ - .octa 0x0000000049c200000000000042590000 - - /* x^199744 mod p(x), x^199680 mod p(x) */ - .octa 0x000000009b97000000000000df950000 - - /* x^198720 mod p(x), x^198656 mod p(x) */ - .octa 0x000000006184000000000000da7b0000 - - /* x^197696 mod p(x), x^197632 mod p(x) */ - .octa 0x00000000461700000000000012510000 - - /* x^196672 mod p(x), x^196608 mod p(x) */ - .octa 0x000000009b40000000000000f37e0000 - - /* x^195648 mod p(x), x^195584 mod p(x) */ - .octa 0x00000000eeb2000000000000ecf10000 - - /* x^194624 mod p(x), x^194560 mod p(x) */ - .octa 0x00000000b2e800000000000050f20000 - - /* x^193600 mod p(x), x^193536 mod p(x) */ - .octa 0x00000000f59a000000000000e0b30000 - - /* x^192576 mod p(x), x^192512 mod p(x) */ - .octa 0x00000000467f0000000000004d5a0000 - - /* x^191552 mod p(x), x^191488 mod p(x) */ - .octa 0x00000000da92000000000000bb010000 - - /* x^190528 mod p(x), x^190464 mod p(x) */ - .octa 0x000000001e1000000000000022a40000 - - /* x^189504 mod p(x), x^189440 mod p(x) */ - .octa 0x0000000058fe000000000000836f0000 - - /* x^188480 mod p(x), x^188416 mod p(x) */ - .octa 0x00000000b9ce000000000000d78d0000 - - /* x^187456 mod p(x), x^187392 mod p(x) */ - .octa 0x0000000022210000000000004f8d0000 - - /* x^186432 mod p(x), x^186368 mod p(x) */ - .octa 0x00000000744600000000000033760000 - - /* x^185408 mod p(x), x^185344 mod p(x) */ - .octa 0x000000001c2e000000000000a1e50000 - - /* x^184384 mod p(x), x^184320 mod p(x) */ - .octa 0x00000000dcc8000000000000a1a40000 - - /* x^183360 mod p(x), x^183296 mod p(x) */ - .octa 0x00000000910f00000000000019a20000 - - /* x^182336 mod p(x), x^182272 mod p(x) */ - .octa 0x0000000055d5000000000000f6ae0000 - - /* x^181312 mod p(x), x^181248 mod p(x) */ - .octa 0x00000000c8ba000000000000a7ac0000 - - /* x^180288 mod p(x), x^180224 mod p(x) */ - .octa 0x0000000031f8000000000000eea20000 - - /* x^179264 mod p(x), x^179200 mod p(x) */ - .octa 0x000000001966000000000000c4d90000 - - /* x^178240 mod p(x), x^178176 mod p(x) */ - .octa 0x00000000b9810000000000002b470000 - - /* x^177216 mod p(x), x^177152 mod p(x) */ - .octa 0x000000008303000000000000f7cf0000 - - /* x^176192 mod p(x), x^176128 mod p(x) */ - .octa 0x000000002ce500000000000035b30000 - - /* x^175168 mod p(x), x^175104 mod p(x) */ - .octa 0x000000002fae0000000000000c7c0000 - - /* x^174144 mod p(x), x^174080 mod p(x) */ - .octa 0x00000000f50c0000000000009edf0000 - - /* x^173120 mod p(x), x^173056 mod p(x) */ - .octa 0x00000000714f00000000000004cd0000 - - /* x^172096 mod p(x), x^172032 mod p(x) */ - .octa 0x00000000c161000000000000541b0000 - - /* x^171072 mod p(x), x^171008 mod p(x) */ - .octa 0x0000000021c8000000000000e2700000 - - /* x^170048 mod p(x), x^169984 mod p(x) */ - .octa 0x00000000b93d00000000000009a60000 - - /* x^169024 mod p(x), x^168960 mod p(x) */ - .octa 0x00000000fbcf000000000000761c0000 - - /* x^168000 mod p(x), x^167936 mod p(x) */ - .octa 0x0000000026350000000000009db30000 - - /* x^166976 mod p(x), x^166912 mod p(x) */ - .octa 0x00000000b64f0000000000003e9f0000 - - /* x^165952 mod p(x), x^165888 mod p(x) */ - .octa 0x00000000bd0e00000000000078590000 - - /* x^164928 mod p(x), x^164864 mod p(x) */ - .octa 0x00000000d9360000000000008bc80000 - - /* x^163904 mod p(x), x^163840 mod p(x) */ - .octa 0x000000002f140000000000008c9f0000 - - /* x^162880 mod p(x), x^162816 mod p(x) */ - .octa 0x000000006a270000000000006af70000 - - /* x^161856 mod p(x), x^161792 mod p(x) */ - .octa 0x000000006685000000000000e5210000 - - /* x^160832 mod p(x), x^160768 mod p(x) */ - .octa 0x0000000062da00000000000008290000 - - /* x^159808 mod p(x), x^159744 mod p(x) */ - .octa 0x00000000bb4b000000000000e4d00000 - - /* x^158784 mod p(x), x^158720 mod p(x) */ - .octa 0x00000000d2490000000000004ae10000 - - /* x^157760 mod p(x), x^157696 mod p(x) */ - .octa 0x00000000c85b00000000000000e70000 - - /* x^156736 mod p(x), x^156672 mod p(x) */ - .octa 0x00000000c37a00000000000015650000 - - /* x^155712 mod p(x), x^155648 mod p(x) */ - .octa 0x0000000018530000000000001c2f0000 - - /* x^154688 mod p(x), x^154624 mod p(x) */ - .octa 0x00000000b46600000000000037bd0000 - - /* x^153664 mod p(x), x^153600 mod p(x) */ - .octa 0x00000000439b00000000000012190000 - - /* x^152640 mod p(x), x^152576 mod p(x) */ - .octa 0x00000000b1260000000000005ece0000 - - /* x^151616 mod p(x), x^151552 mod p(x) */ - .octa 0x00000000d8110000000000002a5e0000 - - /* x^150592 mod p(x), x^150528 mod p(x) */ - .octa 0x00000000099f00000000000052330000 - - /* x^149568 mod p(x), x^149504 mod p(x) */ - .octa 0x00000000f9f9000000000000f9120000 - - /* x^148544 mod p(x), x^148480 mod p(x) */ - .octa 0x000000005cc00000000000000ddc0000 - - /* x^147520 mod p(x), x^147456 mod p(x) */ - .octa 0x00000000343b00000000000012200000 - - /* x^146496 mod p(x), x^146432 mod p(x) */ - .octa 0x000000009222000000000000d12b0000 - - /* x^145472 mod p(x), x^145408 mod p(x) */ - .octa 0x00000000d781000000000000eb2d0000 - - /* x^144448 mod p(x), x^144384 mod p(x) */ - .octa 0x000000000bf400000000000058970000 - - /* x^143424 mod p(x), x^143360 mod p(x) */ - .octa 0x00000000094200000000000013690000 - - /* x^142400 mod p(x), x^142336 mod p(x) */ - .octa 0x00000000d55100000000000051950000 - - /* x^141376 mod p(x), x^141312 mod p(x) */ - .octa 0x000000008f11000000000000954b0000 - - /* x^140352 mod p(x), x^140288 mod p(x) */ - .octa 0x00000000140f000000000000b29e0000 - - /* x^139328 mod p(x), x^139264 mod p(x) */ - .octa 0x00000000c6db000000000000db5d0000 - - /* x^138304 mod p(x), x^138240 mod p(x) */ - .octa 0x00000000715b000000000000dfaf0000 - - /* x^137280 mod p(x), x^137216 mod p(x) */ - .octa 0x000000000dea000000000000e3b60000 - - /* x^136256 mod p(x), x^136192 mod p(x) */ - .octa 0x000000006f94000000000000ddaf0000 - - /* x^135232 mod p(x), x^135168 mod p(x) */ - .octa 0x0000000024e1000000000000e4f70000 - - /* x^134208 mod p(x), x^134144 mod p(x) */ - .octa 0x000000008810000000000000aa110000 - - /* x^133184 mod p(x), x^133120 mod p(x) */ - .octa 0x0000000030c2000000000000a8e60000 - - /* x^132160 mod p(x), x^132096 mod p(x) */ - .octa 0x00000000e6d0000000000000ccf30000 - - /* x^131136 mod p(x), x^131072 mod p(x) */ - .octa 0x000000004da000000000000079bf0000 - - /* x^130112 mod p(x), x^130048 mod p(x) */ - .octa 0x000000007759000000000000b3a30000 - - /* x^129088 mod p(x), x^129024 mod p(x) */ - .octa 0x00000000597400000000000028790000 - - /* x^128064 mod p(x), x^128000 mod p(x) */ - .octa 0x000000007acd000000000000b5820000 - - /* x^127040 mod p(x), x^126976 mod p(x) */ - .octa 0x00000000e6e400000000000026ad0000 - - /* x^126016 mod p(x), x^125952 mod p(x) */ - .octa 0x000000006d49000000000000985b0000 - - /* x^124992 mod p(x), x^124928 mod p(x) */ - .octa 0x000000000f0800000000000011520000 - - /* x^123968 mod p(x), x^123904 mod p(x) */ - .octa 0x000000002c7f000000000000846c0000 - - /* x^122944 mod p(x), x^122880 mod p(x) */ - .octa 0x000000005ce7000000000000ae1d0000 - - /* x^121920 mod p(x), x^121856 mod p(x) */ - .octa 0x00000000d4cb000000000000e21d0000 - - /* x^120896 mod p(x), x^120832 mod p(x) */ - .octa 0x000000003a2300000000000019bb0000 - - /* x^119872 mod p(x), x^119808 mod p(x) */ - .octa 0x000000000e1700000000000095290000 - - /* x^118848 mod p(x), x^118784 mod p(x) */ - .octa 0x000000006e6400000000000050d20000 - - /* x^117824 mod p(x), x^117760 mod p(x) */ - .octa 0x000000008d5c0000000000000cd10000 - - /* x^116800 mod p(x), x^116736 mod p(x) */ - .octa 0x00000000ef310000000000007b570000 - - /* x^115776 mod p(x), x^115712 mod p(x) */ - .octa 0x00000000645d00000000000053d60000 - - /* x^114752 mod p(x), x^114688 mod p(x) */ - .octa 0x0000000018fc00000000000077510000 - - /* x^113728 mod p(x), x^113664 mod p(x) */ - .octa 0x000000000cb3000000000000a7b70000 - - /* x^112704 mod p(x), x^112640 mod p(x) */ - .octa 0x00000000991b000000000000d0780000 - - /* x^111680 mod p(x), x^111616 mod p(x) */ - .octa 0x00000000845a000000000000be3c0000 - - /* x^110656 mod p(x), x^110592 mod p(x) */ - .octa 0x00000000d3a9000000000000df020000 - - /* x^109632 mod p(x), x^109568 mod p(x) */ - .octa 0x0000000017d7000000000000063e0000 - - /* x^108608 mod p(x), x^108544 mod p(x) */ - .octa 0x000000007a860000000000008ab40000 - - /* x^107584 mod p(x), x^107520 mod p(x) */ - .octa 0x00000000fd7c000000000000c7bd0000 - - /* x^106560 mod p(x), x^106496 mod p(x) */ - .octa 0x00000000a56b000000000000efd60000 - - /* x^105536 mod p(x), x^105472 mod p(x) */ - .octa 0x0000000010e400000000000071380000 - - /* x^104512 mod p(x), x^104448 mod p(x) */ - .octa 0x00000000994500000000000004d30000 - - /* x^103488 mod p(x), x^103424 mod p(x) */ - .octa 0x00000000b83c0000000000003b0e0000 - - /* x^102464 mod p(x), x^102400 mod p(x) */ - .octa 0x00000000d6c10000000000008b020000 - - /* x^101440 mod p(x), x^101376 mod p(x) */ - .octa 0x000000009efc000000000000da940000 - - /* x^100416 mod p(x), x^100352 mod p(x) */ - .octa 0x000000005e87000000000000f9f70000 - - /* x^99392 mod p(x), x^99328 mod p(x) */ - .octa 0x000000006c9b00000000000045e40000 - - /* x^98368 mod p(x), x^98304 mod p(x) */ - .octa 0x00000000178a00000000000083940000 - - /* x^97344 mod p(x), x^97280 mod p(x) */ - .octa 0x00000000f0c8000000000000f0a00000 - - /* x^96320 mod p(x), x^96256 mod p(x) */ - .octa 0x00000000f699000000000000b74b0000 - - /* x^95296 mod p(x), x^95232 mod p(x) */ - .octa 0x00000000316d000000000000c1cf0000 - - /* x^94272 mod p(x), x^94208 mod p(x) */ - .octa 0x00000000987e00000000000072680000 - - /* x^93248 mod p(x), x^93184 mod p(x) */ - .octa 0x00000000acff000000000000e0ab0000 - - /* x^92224 mod p(x), x^92160 mod p(x) */ - .octa 0x00000000a1f6000000000000c5a80000 - - /* x^91200 mod p(x), x^91136 mod p(x) */ - .octa 0x0000000061bd000000000000cf690000 - - /* x^90176 mod p(x), x^90112 mod p(x) */ - .octa 0x00000000c9f2000000000000cbcc0000 - - /* x^89152 mod p(x), x^89088 mod p(x) */ - .octa 0x000000005a33000000000000de050000 - - /* x^88128 mod p(x), x^88064 mod p(x) */ - .octa 0x00000000e416000000000000ccd70000 - - /* x^87104 mod p(x), x^87040 mod p(x) */ - .octa 0x0000000058930000000000002f670000 - - /* x^86080 mod p(x), x^86016 mod p(x) */ - .octa 0x00000000a9d3000000000000152f0000 - - /* x^85056 mod p(x), x^84992 mod p(x) */ - .octa 0x00000000c114000000000000ecc20000 - - /* x^84032 mod p(x), x^83968 mod p(x) */ - .octa 0x00000000b9270000000000007c890000 - - /* x^83008 mod p(x), x^82944 mod p(x) */ - .octa 0x000000002e6000000000000006ee0000 - - /* x^81984 mod p(x), x^81920 mod p(x) */ - .octa 0x00000000dfc600000000000009100000 - - /* x^80960 mod p(x), x^80896 mod p(x) */ - .octa 0x000000004911000000000000ad4e0000 - - /* x^79936 mod p(x), x^79872 mod p(x) */ - .octa 0x00000000ae1b000000000000b04d0000 - - /* x^78912 mod p(x), x^78848 mod p(x) */ - .octa 0x0000000005fa000000000000e9900000 - - /* x^77888 mod p(x), x^77824 mod p(x) */ - .octa 0x0000000004a1000000000000cc6f0000 - - /* x^76864 mod p(x), x^76800 mod p(x) */ - .octa 0x00000000af73000000000000ed110000 - - /* x^75840 mod p(x), x^75776 mod p(x) */ - .octa 0x0000000082530000000000008f7e0000 - - /* x^74816 mod p(x), x^74752 mod p(x) */ - .octa 0x00000000cfdc000000000000594f0000 - - /* x^73792 mod p(x), x^73728 mod p(x) */ - .octa 0x00000000a6b6000000000000a8750000 - - /* x^72768 mod p(x), x^72704 mod p(x) */ - .octa 0x00000000fd76000000000000aa0c0000 - - /* x^71744 mod p(x), x^71680 mod p(x) */ - .octa 0x0000000006f500000000000071db0000 - - /* x^70720 mod p(x), x^70656 mod p(x) */ - .octa 0x0000000037ca000000000000ab0c0000 - - /* x^69696 mod p(x), x^69632 mod p(x) */ - .octa 0x00000000d7ab000000000000b7a00000 - - /* x^68672 mod p(x), x^68608 mod p(x) */ - .octa 0x00000000440800000000000090d30000 - - /* x^67648 mod p(x), x^67584 mod p(x) */ - .octa 0x00000000186100000000000054730000 - - /* x^66624 mod p(x), x^66560 mod p(x) */ - .octa 0x000000007368000000000000a3a20000 - - /* x^65600 mod p(x), x^65536 mod p(x) */ - .octa 0x0000000026d0000000000000f9040000 - - /* x^64576 mod p(x), x^64512 mod p(x) */ - .octa 0x00000000fe770000000000009c0a0000 - - /* x^63552 mod p(x), x^63488 mod p(x) */ - .octa 0x000000002cba000000000000d1e70000 - - /* x^62528 mod p(x), x^62464 mod p(x) */ - .octa 0x00000000f8bd0000000000005ac10000 - - /* x^61504 mod p(x), x^61440 mod p(x) */ - .octa 0x000000007372000000000000d68d0000 - - /* x^60480 mod p(x), x^60416 mod p(x) */ - .octa 0x00000000f37f00000000000089f60000 - - /* x^59456 mod p(x), x^59392 mod p(x) */ - .octa 0x00000000078400000000000008a90000 - - /* x^58432 mod p(x), x^58368 mod p(x) */ - .octa 0x00000000d3e400000000000042360000 - - /* x^57408 mod p(x), x^57344 mod p(x) */ - .octa 0x00000000eba800000000000092d50000 - - /* x^56384 mod p(x), x^56320 mod p(x) */ - .octa 0x00000000afbe000000000000b4d50000 - - /* x^55360 mod p(x), x^55296 mod p(x) */ - .octa 0x00000000d8ca000000000000c9060000 - - /* x^54336 mod p(x), x^54272 mod p(x) */ - .octa 0x00000000c2d00000000000008f4f0000 - - /* x^53312 mod p(x), x^53248 mod p(x) */ - .octa 0x00000000373200000000000028690000 - - /* x^52288 mod p(x), x^52224 mod p(x) */ - .octa 0x0000000046ae000000000000c3b30000 - - /* x^51264 mod p(x), x^51200 mod p(x) */ - .octa 0x00000000b243000000000000f8700000 - - /* x^50240 mod p(x), x^50176 mod p(x) */ - .octa 0x00000000f7f500000000000029eb0000 - - /* x^49216 mod p(x), x^49152 mod p(x) */ - .octa 0x000000000c7e000000000000fe730000 - - /* x^48192 mod p(x), x^48128 mod p(x) */ - .octa 0x00000000c38200000000000096000000 - - /* x^47168 mod p(x), x^47104 mod p(x) */ - .octa 0x000000008956000000000000683c0000 - - /* x^46144 mod p(x), x^46080 mod p(x) */ - .octa 0x00000000422d0000000000005f1e0000 - - /* x^45120 mod p(x), x^45056 mod p(x) */ - .octa 0x00000000ac0f0000000000006f810000 - - /* x^44096 mod p(x), x^44032 mod p(x) */ - .octa 0x00000000ce30000000000000031f0000 - - /* x^43072 mod p(x), x^43008 mod p(x) */ - .octa 0x000000003d43000000000000455a0000 - - /* x^42048 mod p(x), x^41984 mod p(x) */ - .octa 0x000000007ebe000000000000a6050000 - - /* x^41024 mod p(x), x^40960 mod p(x) */ - .octa 0x00000000976e00000000000077eb0000 - - /* x^40000 mod p(x), x^39936 mod p(x) */ - .octa 0x000000000872000000000000389c0000 - - /* x^38976 mod p(x), x^38912 mod p(x) */ - .octa 0x000000008979000000000000c7b20000 - - /* x^37952 mod p(x), x^37888 mod p(x) */ - .octa 0x000000005c1e0000000000001d870000 - - /* x^36928 mod p(x), x^36864 mod p(x) */ - .octa 0x00000000aebb00000000000045810000 - - /* x^35904 mod p(x), x^35840 mod p(x) */ - .octa 0x000000004f7e0000000000006d4a0000 - - /* x^34880 mod p(x), x^34816 mod p(x) */ - .octa 0x00000000ea98000000000000b9200000 - - /* x^33856 mod p(x), x^33792 mod p(x) */ - .octa 0x00000000f39600000000000022f20000 - - /* x^32832 mod p(x), x^32768 mod p(x) */ - .octa 0x000000000bc500000000000041ca0000 - - /* x^31808 mod p(x), x^31744 mod p(x) */ - .octa 0x00000000786400000000000078500000 - - /* x^30784 mod p(x), x^30720 mod p(x) */ - .octa 0x00000000be970000000000009e7e0000 - - /* x^29760 mod p(x), x^29696 mod p(x) */ - .octa 0x00000000dd6d000000000000a53c0000 - - /* x^28736 mod p(x), x^28672 mod p(x) */ - .octa 0x000000004c3f00000000000039340000 - - /* x^27712 mod p(x), x^27648 mod p(x) */ - .octa 0x0000000093a4000000000000b58e0000 - - /* x^26688 mod p(x), x^26624 mod p(x) */ - .octa 0x0000000050fb00000000000062d40000 - - /* x^25664 mod p(x), x^25600 mod p(x) */ - .octa 0x00000000f505000000000000a26f0000 - - /* x^24640 mod p(x), x^24576 mod p(x) */ - .octa 0x0000000064f900000000000065e60000 - - /* x^23616 mod p(x), x^23552 mod p(x) */ - .octa 0x00000000e8c2000000000000aad90000 - - /* x^22592 mod p(x), x^22528 mod p(x) */ - .octa 0x00000000720b000000000000a3b00000 - - /* x^21568 mod p(x), x^21504 mod p(x) */ - .octa 0x00000000e992000000000000d2680000 - - /* x^20544 mod p(x), x^20480 mod p(x) */ - .octa 0x000000009132000000000000cf4c0000 - - /* x^19520 mod p(x), x^19456 mod p(x) */ - .octa 0x00000000608a00000000000076610000 - - /* x^18496 mod p(x), x^18432 mod p(x) */ - .octa 0x000000009948000000000000fb9f0000 - - /* x^17472 mod p(x), x^17408 mod p(x) */ - .octa 0x00000000173000000000000003770000 - - /* x^16448 mod p(x), x^16384 mod p(x) */ - .octa 0x000000006fe300000000000004880000 - - /* x^15424 mod p(x), x^15360 mod p(x) */ - .octa 0x00000000e15300000000000056a70000 - - /* x^14400 mod p(x), x^14336 mod p(x) */ - .octa 0x0000000092d60000000000009dfd0000 - - /* x^13376 mod p(x), x^13312 mod p(x) */ - .octa 0x0000000002fd00000000000074c80000 - - /* x^12352 mod p(x), x^12288 mod p(x) */ - .octa 0x00000000c78b000000000000a3ec0000 - - /* x^11328 mod p(x), x^11264 mod p(x) */ - .octa 0x000000009262000000000000b3530000 - - /* x^10304 mod p(x), x^10240 mod p(x) */ - .octa 0x0000000084f200000000000047bf0000 - - /* x^9280 mod p(x), x^9216 mod p(x) */ - .octa 0x0000000067ee000000000000e97c0000 - - /* x^8256 mod p(x), x^8192 mod p(x) */ - .octa 0x00000000535b00000000000091e10000 - - /* x^7232 mod p(x), x^7168 mod p(x) */ - .octa 0x000000007ebb00000000000055060000 - - /* x^6208 mod p(x), x^6144 mod p(x) */ - .octa 0x00000000c6a1000000000000fd360000 - - /* x^5184 mod p(x), x^5120 mod p(x) */ - .octa 0x000000001be500000000000055860000 - - /* x^4160 mod p(x), x^4096 mod p(x) */ - .octa 0x00000000ae0e0000000000005bd00000 - - /* x^3136 mod p(x), x^3072 mod p(x) */ - .octa 0x0000000022040000000000008db20000 - - /* x^2112 mod p(x), x^2048 mod p(x) */ - .octa 0x00000000c9eb000000000000efe20000 - - /* x^1088 mod p(x), x^1024 mod p(x) */ - .octa 0x0000000039b400000000000051d10000 - -.short_constants: - - /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ - /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */ - .octa 0xefe20000dccf00009440000033590000 - - /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */ - .octa 0xee6300002f3f000062180000e0ed0000 - - /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */ - .octa 0xcf5f000017ef0000ccbe000023d30000 - - /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */ - .octa 0x6d0c0000a30e00000920000042630000 - - /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */ - .octa 0x21d30000932b0000a7a00000efcc0000 - - /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */ - .octa 0x10be00000b310000666f00000d1c0000 - - /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */ - .octa 0x1f240000ce9e0000caad0000589e0000 - - /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */ - .octa 0x29610000d02b000039b400007cf50000 - - /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */ - .octa 0x51d100009d9d00003c0e0000bfd60000 - - /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */ - .octa 0xda390000ceae000013830000713c0000 - - /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */ - .octa 0xb67800001e16000085c0000080a60000 - - /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */ - .octa 0x0db40000f7f90000371d0000e6580000 - - /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */ - .octa 0x87e70000044c0000aadb0000a4970000 - - /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */ - .octa 0x1f990000ad180000d8b30000e7b50000 - - /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */ - .octa 0xbe6c00006ee300004c1a000006df0000 - - /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */ - .octa 0xfb0b00002d560000136800008bb70000 - - -.barrett_constants: - /* Barrett constant m - (4^32)/n */ - .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */ - /* Barrett constant n */ - .octa 0x0000000000000000000000018bb70000 - -#define CRC_FUNCTION_NAME __crct10dif_vpmsum -#include "crc32-vpmsum_core.S" diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c deleted file mode 100644 index 1dc8b6915178..000000000000 --- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Calculate a CRC T10-DIF with vpmsum acceleration - * - * Copyright 2017, Daniel Axtens, IBM Corporation. - * [based on crc32c-vpmsum_glue.c] - */ - -#include <linux/crc-t10dif.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/cpufeature.h> -#include <asm/simd.h> -#include <asm/switch_to.h> - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#define VECTOR_BREAKPOINT 64 - -u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); - -static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len) -{ - unsigned int prealign; - unsigned int tail; - u32 crc = crci; - - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) - return crc_t10dif_generic(crc, p, len); - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = crc_t10dif_generic(crc, p, prealign); - len -= prealign; - p += prealign; - } - - if (len & ~VMX_ALIGN_MASK) { - crc <<= 16; - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - disable_kernel_altivec(); - pagefault_enable(); - preempt_enable(); - crc >>= 16; - } - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = crc_t10dif_generic(crc, p, tail); - } - - return crc & 0xffff; -} - -static int crct10dif_vpmsum_init(struct shash_desc *desc) -{ - u16 *crc = shash_desc_ctx(desc); - - *crc = 0; - return 0; -} - -static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - u16 *crc = shash_desc_ctx(desc); - - *crc = crct10dif_vpmsum(*crc, data, length); - - return 0; -} - - -static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out) -{ - u16 *crcp = shash_desc_ctx(desc); - - *(u16 *)out = *crcp; - return 0; -} - -static struct shash_alg alg = { - .init = crct10dif_vpmsum_init, - .update = crct10dif_vpmsum_update, - .final = crct10dif_vpmsum_final, - .descsize = CRC_T10DIF_DIGEST_SIZE, - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .base = { - .cra_name = "crct10dif", - .cra_driver_name = "crct10dif-vpmsum", - .cra_priority = 200, - .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init crct10dif_vpmsum_mod_init(void) -{ - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit crct10dif_vpmsum_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init); -module_exit(crct10dif_vpmsum_mod_fini); - -MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>"); -MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("crct10dif"); -MODULE_ALIAS_CRYPTO("crct10dif-vpmsum"); diff --git a/arch/powerpc/crypto/ghash.c b/arch/powerpc/crypto/ghash.c index 77eca20bc7ac..7308735bdb33 100644 --- a/arch/powerpc/crypto/ghash.c +++ b/arch/powerpc/crypto/ghash.c @@ -11,19 +11,18 @@ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> */ -#include <linux/types.h> -#include <linux/err.h> -#include <linux/crypto.h> -#include <linux/delay.h> -#include <asm/simd.h> +#include "aesp8-ppc.h" #include <asm/switch_to.h> #include <crypto/aes.h> +#include <crypto/gf128mul.h> #include <crypto/ghash.h> -#include <crypto/scatterwalk.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> -#include <crypto/b128ops.h> -#include "aesp8-ppc.h" +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/uaccess.h> void gcm_init_p8(u128 htable[16], const u64 Xi[2]); void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]); @@ -39,15 +38,12 @@ struct p8_ghash_ctx { struct p8_ghash_desc_ctx { u64 shash[2]; - u8 buffer[GHASH_DIGEST_SIZE]; - int bytes; }; static int p8_ghash_init(struct shash_desc *desc) { struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - dctx->bytes = 0; memset(dctx->shash, 0, GHASH_DIGEST_SIZE); return 0; } @@ -74,27 +70,30 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key, } static inline void __ghash_block(struct p8_ghash_ctx *ctx, - struct p8_ghash_desc_ctx *dctx) + struct p8_ghash_desc_ctx *dctx, + const u8 *src) { if (crypto_simd_usable()) { preempt_disable(); pagefault_disable(); enable_kernel_vsx(); - gcm_ghash_p8(dctx->shash, ctx->htable, - dctx->buffer, GHASH_DIGEST_SIZE); + gcm_ghash_p8(dctx->shash, ctx->htable, src, GHASH_BLOCK_SIZE); disable_kernel_vsx(); pagefault_enable(); preempt_enable(); } else { - crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE); + crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE); gf128mul_lle((be128 *)dctx->shash, &ctx->key); } } -static inline void __ghash_blocks(struct p8_ghash_ctx *ctx, - struct p8_ghash_desc_ctx *dctx, - const u8 *src, unsigned int srclen) +static inline int __ghash_blocks(struct p8_ghash_ctx *ctx, + struct p8_ghash_desc_ctx *dctx, + const u8 *src, unsigned int srclen) { + int remain = srclen - round_down(srclen, GHASH_BLOCK_SIZE); + + srclen -= remain; if (crypto_simd_usable()) { preempt_disable(); pagefault_disable(); @@ -105,62 +104,38 @@ static inline void __ghash_blocks(struct p8_ghash_ctx *ctx, pagefault_enable(); preempt_enable(); } else { - while (srclen >= GHASH_BLOCK_SIZE) { + do { crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE); gf128mul_lle((be128 *)dctx->shash, &ctx->key); srclen -= GHASH_BLOCK_SIZE; src += GHASH_BLOCK_SIZE; - } + } while (srclen); } + + return remain; } static int p8_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - unsigned int len; struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm)); struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - if (dctx->bytes) { - if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) { - memcpy(dctx->buffer + dctx->bytes, src, - srclen); - dctx->bytes += srclen; - return 0; - } - memcpy(dctx->buffer + dctx->bytes, src, - GHASH_DIGEST_SIZE - dctx->bytes); - - __ghash_block(ctx, dctx); - - src += GHASH_DIGEST_SIZE - dctx->bytes; - srclen -= GHASH_DIGEST_SIZE - dctx->bytes; - dctx->bytes = 0; - } - len = srclen & ~(GHASH_DIGEST_SIZE - 1); - if (len) { - __ghash_blocks(ctx, dctx, src, len); - src += len; - srclen -= len; - } - if (srclen) { - memcpy(dctx->buffer, src, srclen); - dctx->bytes = srclen; - } - return 0; + return __ghash_blocks(ctx, dctx, src, srclen); } -static int p8_ghash_final(struct shash_desc *desc, u8 *out) +static int p8_ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { - int i; struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm)); struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - if (dctx->bytes) { - for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++) - dctx->buffer[i] = 0; - __ghash_block(ctx, dctx); - dctx->bytes = 0; + if (len) { + u8 buf[GHASH_BLOCK_SIZE] = {}; + + memcpy(buf, src, len); + __ghash_block(ctx, dctx, buf); + memzero_explicit(buf, sizeof(buf)); } memcpy(out, dctx->shash, GHASH_DIGEST_SIZE); return 0; @@ -170,14 +145,14 @@ struct shash_alg p8_ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = p8_ghash_init, .update = p8_ghash_update, - .final = p8_ghash_final, + .finup = p8_ghash_finup, .setkey = p8_ghash_setkey, - .descsize = sizeof(struct p8_ghash_desc_ctx) - + sizeof(struct ghash_desc_ctx), + .descsize = sizeof(struct p8_ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "p8_ghash", .cra_priority = 1000, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct p8_ghash_ctx), .cra_module = THIS_MODULE, diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c index c24f605033bd..204440a90cd8 100644 --- a/arch/powerpc/crypto/md5-glue.c +++ b/arch/powerpc/crypto/md5-glue.c @@ -8,25 +8,13 @@ */ #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/md5.h> -#include <asm/byteorder.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks); -static inline void ppc_md5_clear_context(struct md5_state *sctx) -{ - int count = sizeof(struct md5_state) >> 2; - u32 *ptr = (u32 *)sctx; - - /* make sure we can clear the fast way */ - BUILD_BUG_ON(sizeof(struct md5_state) % 4); - do { *ptr++ = 0; } while (--count); -} - static int ppc_md5_init(struct shash_desc *desc) { struct md5_state *sctx = shash_desc_ctx(desc); @@ -44,79 +32,34 @@ static int ppc_md5_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct md5_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->byte_count & 0x3f; - unsigned int avail = 64 - offset; - const u8 *src = data; - sctx->byte_count += len; - - if (avail > len) { - memcpy((char *)sctx->block + offset, src, len); - return 0; - } - - if (offset) { - memcpy((char *)sctx->block + offset, src, avail); - ppc_md5_transform(sctx->hash, (const u8 *)sctx->block, 1); - len -= avail; - src += avail; - } - - if (len > 63) { - ppc_md5_transform(sctx->hash, src, len >> 6); - src += len & ~0x3f; - len &= 0x3f; - } - - memcpy((char *)sctx->block, src, len); - return 0; + sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE); + ppc_md5_transform(sctx->hash, data, len >> 6); + return len - round_down(len, MD5_HMAC_BLOCK_SIZE); } -static int ppc_md5_final(struct shash_desc *desc, u8 *out) +static int ppc_md5_finup(struct shash_desc *desc, const u8 *src, + unsigned int offset, u8 *out) { struct md5_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->byte_count & 0x3f; - const u8 *src = (const u8 *)sctx->block; - u8 *p = (u8 *)src + offset; - int padlen = 55 - offset; - __le64 *pbits = (__le64 *)((char *)sctx->block + 56); + __le64 block[MD5_BLOCK_WORDS] = {}; + u8 *p = memcpy(block, src, offset); __le32 *dst = (__le32 *)out; + __le64 *pbits; + src = p; + p += offset; *p++ = 0x80; - - if (padlen < 0) { - memset(p, 0x00, padlen + sizeof (u64)); - ppc_md5_transform(sctx->hash, src, 1); - p = (char *)sctx->block; - padlen = 56; - } - - memset(p, 0, padlen); + sctx->byte_count += offset; + pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1]; *pbits = cpu_to_le64(sctx->byte_count << 3); - ppc_md5_transform(sctx->hash, src, 1); + ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8); + memzero_explicit(block, sizeof(block)); dst[0] = cpu_to_le32(sctx->hash[0]); dst[1] = cpu_to_le32(sctx->hash[1]); dst[2] = cpu_to_le32(sctx->hash[2]); dst[3] = cpu_to_le32(sctx->hash[3]); - - ppc_md5_clear_context(sctx); - return 0; -} - -static int ppc_md5_export(struct shash_desc *desc, void *out) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - return 0; -} - -static int ppc_md5_import(struct shash_desc *desc, const void *in) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); return 0; } @@ -124,15 +67,13 @@ static struct shash_alg alg = { .digestsize = MD5_DIGEST_SIZE, .init = ppc_md5_init, .update = ppc_md5_update, - .final = ppc_md5_final, - .export = ppc_md5_export, - .import = ppc_md5_import, - .descsize = sizeof(struct md5_state), - .statesize = sizeof(struct md5_state), + .finup = ppc_md5_finup, + .descsize = MD5_STATE_SIZE, .base = { .cra_name = "md5", .cra_driver_name= "md5-ppc", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = MD5_HMAC_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c deleted file mode 100644 index 369686e9370b..000000000000 --- a/arch/powerpc/crypto/poly1305-p10-glue.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Poly1305 authenticator algorithm, RFC7539. - * - * Copyright 2023- IBM Corp. All rights reserved. - */ - -#include <crypto/algapi.h> -#include <linux/crypto.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/jump_label.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/cpufeature.h> -#include <linux/unaligned.h> -#include <asm/simd.h> -#include <asm/switch_to.h> - -asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen); -asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit); -asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst); - -static void vsx_begin(void) -{ - preempt_disable(); - enable_kernel_vsx(); -} - -static void vsx_end(void) -{ - disable_kernel_vsx(); - preempt_enable(); -} - -static int crypto_poly1305_p10_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - poly1305_core_init(&dctx->h); - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - - return 0; -} - -static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, - const u8 *inp, unsigned int len) -{ - unsigned int acc = 0; - - if (unlikely(!dctx->sset)) { - if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { - struct poly1305_core_key *key = &dctx->core_r; - - key->key.r64[0] = get_unaligned_le64(&inp[0]); - key->key.r64[1] = get_unaligned_le64(&inp[8]); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - acc += POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(&inp[0]); - dctx->s[1] = get_unaligned_le32(&inp[4]); - dctx->s[2] = get_unaligned_le32(&inp[8]); - dctx->s[3] = get_unaligned_le32(&inp[12]); - acc += POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - } - return acc; -} - -static int crypto_poly1305_p10_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int bytes, used; - - if (unlikely(dctx->buflen)) { - bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - srclen -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, - POLY1305_BLOCK_SIZE))) { - vsx_begin(); - poly1305_64s(&dctx->h, dctx->buf, - POLY1305_BLOCK_SIZE, 1); - vsx_end(); - } - dctx->buflen = 0; - } - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = round_down(srclen, POLY1305_BLOCK_SIZE); - used = crypto_poly1305_setdctxkey(dctx, src, bytes); - if (likely(used)) { - srclen -= used; - src += used; - } - if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) { - vsx_begin(); - poly1305_p10le_4blocks(&dctx->h, src, srclen); - vsx_end(); - src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4)); - srclen %= POLY1305_BLOCK_SIZE * 4; - } - while (srclen >= POLY1305_BLOCK_SIZE) { - vsx_begin(); - poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1); - vsx_end(); - srclen -= POLY1305_BLOCK_SIZE; - src += POLY1305_BLOCK_SIZE; - } - } - - if (unlikely(srclen)) { - dctx->buflen = srclen; - memcpy(dctx->buf, src, srclen); - } - - return 0; -} - -static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - if ((dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - vsx_begin(); - poly1305_64s(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - vsx_end(); - dctx->buflen = 0; - } - - poly1305_emit_64(&dctx->h, &dctx->s, dst); - return 0; -} - -static struct shash_alg poly1305_alg = { - .digestsize = POLY1305_DIGEST_SIZE, - .init = crypto_poly1305_p10_init, - .update = crypto_poly1305_p10_update, - .final = crypto_poly1305_p10_final, - .descsize = sizeof(struct poly1305_desc_ctx), - .base = { - .cra_name = "poly1305", - .cra_driver_name = "poly1305-p10", - .cra_priority = 300, - .cra_blocksize = POLY1305_BLOCK_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -static int __init poly1305_p10_init(void) -{ - return crypto_register_shash(&poly1305_alg); -} - -static void __exit poly1305_p10_exit(void) -{ - crypto_unregister_shash(&poly1305_alg); -} - -module_cpu_feature_match(PPC_MODULE_FEATURE_P10, poly1305_p10_init); -module_exit(poly1305_p10_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); -MODULE_DESCRIPTION("Optimized Poly1305 for P10"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-p10"); diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/crypto/poly1305-p10le_64.S deleted file mode 100644 index a3c1987f1ecd..000000000000 --- a/arch/powerpc/crypto/poly1305-p10le_64.S +++ /dev/null @@ -1,1075 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -# -# Accelerated poly1305 implementation for ppc64le. -# -# Copyright 2023- IBM Corp. All rights reserved -# -#=================================================================================== -# Written by Danny Tsen <dtsen@us.ibm.com> -# -# Poly1305 - this version mainly using vector/VSX/Scalar -# - 26 bits limbs -# - Handle multiple 64 byte blcok. -# -# Block size 16 bytes -# key = (r, s) -# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF -# p = 2^130 - 5 -# a += m -# a = (r + a) % p -# a += s -# -# Improve performance by breaking down polynominal to the sum of products with -# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r -# -# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 -# to 9 vectors for multiplications. -# -# setup r^4, r^3, r^2, r vectors -# vs [r^1, r^3, r^2, r^4] -# vs0 = [r0,.....] -# vs1 = [r1,.....] -# vs2 = [r2,.....] -# vs3 = [r3,.....] -# vs4 = [r4,.....] -# vs5 = [r1*5,...] -# vs6 = [r2*5,...] -# vs7 = [r2*5,...] -# vs8 = [r4*5,...] -# -# Each word in a vector consists a member of a "r/s" in [a * r/s]. -# -# r0, r4*5, r3*5, r2*5, r1*5; -# r1, r0, r4*5, r3*5, r2*5; -# r2, r1, r0, r4*5, r3*5; -# r3, r2, r1, r0, r4*5; -# r4, r3, r2, r1, r0 ; -# -# -# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) -# k = 32 bytes key -# r3 = k (r, s) -# r4 = mlen -# r5 = m -# -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/asm-compat.h> -#include <linux/linkage.h> - -.machine "any" - -.text - -.macro SAVE_GPR GPR OFFSET FRAME - std \GPR,\OFFSET(\FRAME) -.endm - -.macro SAVE_VRS VRS OFFSET FRAME - li 16, \OFFSET - stvx \VRS, 16, \FRAME -.endm - -.macro SAVE_VSX VSX OFFSET FRAME - li 16, \OFFSET - stxvx \VSX, 16, \FRAME -.endm - -.macro RESTORE_GPR GPR OFFSET FRAME - ld \GPR,\OFFSET(\FRAME) -.endm - -.macro RESTORE_VRS VRS OFFSET FRAME - li 16, \OFFSET - lvx \VRS, 16, \FRAME -.endm - -.macro RESTORE_VSX VSX OFFSET FRAME - li 16, \OFFSET - lxvx \VSX, 16, \FRAME -.endm - -.macro SAVE_REGS - mflr 0 - std 0, 16(1) - stdu 1,-752(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - addi 9, 1, 256 - SAVE_VRS 20, 0, 9 - SAVE_VRS 21, 16, 9 - SAVE_VRS 22, 32, 9 - SAVE_VRS 23, 48, 9 - SAVE_VRS 24, 64, 9 - SAVE_VRS 25, 80, 9 - SAVE_VRS 26, 96, 9 - SAVE_VRS 27, 112, 9 - SAVE_VRS 28, 128, 9 - SAVE_VRS 29, 144, 9 - SAVE_VRS 30, 160, 9 - SAVE_VRS 31, 176, 9 - - SAVE_VSX 14, 192, 9 - SAVE_VSX 15, 208, 9 - SAVE_VSX 16, 224, 9 - SAVE_VSX 17, 240, 9 - SAVE_VSX 18, 256, 9 - SAVE_VSX 19, 272, 9 - SAVE_VSX 20, 288, 9 - SAVE_VSX 21, 304, 9 - SAVE_VSX 22, 320, 9 - SAVE_VSX 23, 336, 9 - SAVE_VSX 24, 352, 9 - SAVE_VSX 25, 368, 9 - SAVE_VSX 26, 384, 9 - SAVE_VSX 27, 400, 9 - SAVE_VSX 28, 416, 9 - SAVE_VSX 29, 432, 9 - SAVE_VSX 30, 448, 9 - SAVE_VSX 31, 464, 9 -.endm # SAVE_REGS - -.macro RESTORE_REGS - addi 9, 1, 256 - RESTORE_VRS 20, 0, 9 - RESTORE_VRS 21, 16, 9 - RESTORE_VRS 22, 32, 9 - RESTORE_VRS 23, 48, 9 - RESTORE_VRS 24, 64, 9 - RESTORE_VRS 25, 80, 9 - RESTORE_VRS 26, 96, 9 - RESTORE_VRS 27, 112, 9 - RESTORE_VRS 28, 128, 9 - RESTORE_VRS 29, 144, 9 - RESTORE_VRS 30, 160, 9 - RESTORE_VRS 31, 176, 9 - - RESTORE_VSX 14, 192, 9 - RESTORE_VSX 15, 208, 9 - RESTORE_VSX 16, 224, 9 - RESTORE_VSX 17, 240, 9 - RESTORE_VSX 18, 256, 9 - RESTORE_VSX 19, 272, 9 - RESTORE_VSX 20, 288, 9 - RESTORE_VSX 21, 304, 9 - RESTORE_VSX 22, 320, 9 - RESTORE_VSX 23, 336, 9 - RESTORE_VSX 24, 352, 9 - RESTORE_VSX 25, 368, 9 - RESTORE_VSX 26, 384, 9 - RESTORE_VSX 27, 400, 9 - RESTORE_VSX 28, 416, 9 - RESTORE_VSX 29, 432, 9 - RESTORE_VSX 30, 448, 9 - RESTORE_VSX 31, 464, 9 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 752 - ld 0, 16(1) - mtlr 0 -.endm # RESTORE_REGS - -# -# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; -# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; -# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; -# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; -# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; -# -# [r^2, r^3, r^1, r^4] -# [m3, m2, m4, m1] -# -# multiply odd and even words -.macro mul_odd - vmulouw 14, 4, 26 - vmulouw 10, 5, 3 - vmulouw 11, 6, 2 - vmulouw 12, 7, 1 - vmulouw 13, 8, 0 - vmulouw 15, 4, 27 - vaddudm 14, 14, 10 - vaddudm 14, 14, 11 - vmulouw 10, 5, 26 - vmulouw 11, 6, 3 - vaddudm 14, 14, 12 - vaddudm 14, 14, 13 # x0 - vaddudm 15, 15, 10 - vaddudm 15, 15, 11 - vmulouw 12, 7, 2 - vmulouw 13, 8, 1 - vaddudm 15, 15, 12 - vaddudm 15, 15, 13 # x1 - vmulouw 16, 4, 28 - vmulouw 10, 5, 27 - vmulouw 11, 6, 26 - vaddudm 16, 16, 10 - vaddudm 16, 16, 11 - vmulouw 12, 7, 3 - vmulouw 13, 8, 2 - vaddudm 16, 16, 12 - vaddudm 16, 16, 13 # x2 - vmulouw 17, 4, 29 - vmulouw 10, 5, 28 - vmulouw 11, 6, 27 - vaddudm 17, 17, 10 - vaddudm 17, 17, 11 - vmulouw 12, 7, 26 - vmulouw 13, 8, 3 - vaddudm 17, 17, 12 - vaddudm 17, 17, 13 # x3 - vmulouw 18, 4, 30 - vmulouw 10, 5, 29 - vmulouw 11, 6, 28 - vaddudm 18, 18, 10 - vaddudm 18, 18, 11 - vmulouw 12, 7, 27 - vmulouw 13, 8, 26 - vaddudm 18, 18, 12 - vaddudm 18, 18, 13 # x4 -.endm - -.macro mul_even - vmuleuw 9, 4, 26 - vmuleuw 10, 5, 3 - vmuleuw 11, 6, 2 - vmuleuw 12, 7, 1 - vmuleuw 13, 8, 0 - vaddudm 14, 14, 9 - vaddudm 14, 14, 10 - vaddudm 14, 14, 11 - vaddudm 14, 14, 12 - vaddudm 14, 14, 13 # x0 - - vmuleuw 9, 4, 27 - vmuleuw 10, 5, 26 - vmuleuw 11, 6, 3 - vmuleuw 12, 7, 2 - vmuleuw 13, 8, 1 - vaddudm 15, 15, 9 - vaddudm 15, 15, 10 - vaddudm 15, 15, 11 - vaddudm 15, 15, 12 - vaddudm 15, 15, 13 # x1 - - vmuleuw 9, 4, 28 - vmuleuw 10, 5, 27 - vmuleuw 11, 6, 26 - vmuleuw 12, 7, 3 - vmuleuw 13, 8, 2 - vaddudm 16, 16, 9 - vaddudm 16, 16, 10 - vaddudm 16, 16, 11 - vaddudm 16, 16, 12 - vaddudm 16, 16, 13 # x2 - - vmuleuw 9, 4, 29 - vmuleuw 10, 5, 28 - vmuleuw 11, 6, 27 - vmuleuw 12, 7, 26 - vmuleuw 13, 8, 3 - vaddudm 17, 17, 9 - vaddudm 17, 17, 10 - vaddudm 17, 17, 11 - vaddudm 17, 17, 12 - vaddudm 17, 17, 13 # x3 - - vmuleuw 9, 4, 30 - vmuleuw 10, 5, 29 - vmuleuw 11, 6, 28 - vmuleuw 12, 7, 27 - vmuleuw 13, 8, 26 - vaddudm 18, 18, 9 - vaddudm 18, 18, 10 - vaddudm 18, 18, 11 - vaddudm 18, 18, 12 - vaddudm 18, 18, 13 # x4 -.endm - -# -# poly1305_setup_r -# -# setup r^4, r^3, r^2, r vectors -# [r, r^3, r^2, r^4] -# vs0 = [r0,...] -# vs1 = [r1,...] -# vs2 = [r2,...] -# vs3 = [r3,...] -# vs4 = [r4,...] -# vs5 = [r4*5,...] -# vs6 = [r3*5,...] -# vs7 = [r2*5,...] -# vs8 = [r1*5,...] -# -# r0, r4*5, r3*5, r2*5, r1*5; -# r1, r0, r4*5, r3*5, r2*5; -# r2, r1, r0, r4*5, r3*5; -# r3, r2, r1, r0, r4*5; -# r4, r3, r2, r1, r0 ; -# -.macro poly1305_setup_r - - # save r - xxlor 26, 58, 58 - xxlor 27, 59, 59 - xxlor 28, 60, 60 - xxlor 29, 61, 61 - xxlor 30, 62, 62 - - xxlxor 31, 31, 31 - -# [r, r^3, r^2, r^4] - # compute r^2 - vmr 4, 26 - vmr 5, 27 - vmr 6, 28 - vmr 7, 29 - vmr 8, 30 - bl do_mul # r^2 r^1 - xxpermdi 58, 58, 36, 0x3 # r0 - xxpermdi 59, 59, 37, 0x3 # r1 - xxpermdi 60, 60, 38, 0x3 # r2 - xxpermdi 61, 61, 39, 0x3 # r3 - xxpermdi 62, 62, 40, 0x3 # r4 - xxpermdi 36, 36, 36, 0x3 - xxpermdi 37, 37, 37, 0x3 - xxpermdi 38, 38, 38, 0x3 - xxpermdi 39, 39, 39, 0x3 - xxpermdi 40, 40, 40, 0x3 - vspltisb 13, 2 - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 - - bl do_mul # r^4 r^3 - vmrgow 26, 26, 4 - vmrgow 27, 27, 5 - vmrgow 28, 28, 6 - vmrgow 29, 29, 7 - vmrgow 30, 30, 8 - vspltisb 13, 2 - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 - - # r^2 r^4 - xxlor 0, 58, 58 - xxlor 1, 59, 59 - xxlor 2, 60, 60 - xxlor 3, 61, 61 - xxlor 4, 62, 62 - xxlor 5, 32, 32 - xxlor 6, 33, 33 - xxlor 7, 34, 34 - xxlor 8, 35, 35 - - vspltw 9, 26, 3 - vspltw 10, 26, 2 - vmrgow 26, 10, 9 - vspltw 9, 27, 3 - vspltw 10, 27, 2 - vmrgow 27, 10, 9 - vspltw 9, 28, 3 - vspltw 10, 28, 2 - vmrgow 28, 10, 9 - vspltw 9, 29, 3 - vspltw 10, 29, 2 - vmrgow 29, 10, 9 - vspltw 9, 30, 3 - vspltw 10, 30, 2 - vmrgow 30, 10, 9 - - vsld 9, 27, 13 - vsld 10, 28, 13 - vsld 11, 29, 13 - vsld 12, 30, 13 - vaddudm 0, 9, 27 - vaddudm 1, 10, 28 - vaddudm 2, 11, 29 - vaddudm 3, 12, 30 -.endm - -SYM_FUNC_START_LOCAL(do_mul) - mul_odd - - # do reduction ( h %= p ) - # carry reduction - vspltisb 9, 2 - vsrd 10, 14, 31 - vsrd 11, 17, 31 - vand 7, 17, 25 - vand 4, 14, 25 - vaddudm 18, 18, 11 - vsrd 12, 18, 31 - vaddudm 15, 15, 10 - - vsrd 11, 15, 31 - vand 8, 18, 25 - vand 5, 15, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 16, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vaddudm 8, 8, 11 - blr -SYM_FUNC_END(do_mul) - -# -# init key -# -.macro do_poly1305_init - addis 10, 2, rmask@toc@ha - addi 10, 10, rmask@toc@l - - ld 11, 0(10) - ld 12, 8(10) - - li 14, 16 - li 15, 32 - addis 10, 2, cnum@toc@ha - addi 10, 10, cnum@toc@l - lvx 25, 0, 10 # v25 - mask - lvx 31, 14, 10 # v31 = 1a - lvx 19, 15, 10 # v19 = 1 << 24 - lxv 24, 48(10) # vs24 - lxv 25, 64(10) # vs25 - - # initialize - # load key from r3 to vectors - ld 9, 24(3) - ld 10, 32(3) - and. 9, 9, 11 - and. 10, 10, 12 - - # break 26 bits - extrdi 14, 9, 26, 38 - extrdi 15, 9, 26, 12 - extrdi 16, 9, 12, 0 - mtvsrdd 58, 0, 14 - insrdi 16, 10, 14, 38 - mtvsrdd 59, 0, 15 - extrdi 17, 10, 26, 24 - mtvsrdd 60, 0, 16 - extrdi 18, 10, 24, 0 - mtvsrdd 61, 0, 17 - mtvsrdd 62, 0, 18 - - # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 - li 9, 5 - mtvsrdd 36, 0, 9 - vmulouw 0, 27, 4 # v0 = rr0 - vmulouw 1, 28, 4 # v1 = rr1 - vmulouw 2, 29, 4 # v2 = rr2 - vmulouw 3, 30, 4 # v3 = rr3 -.endm - -# -# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) -# k = 32 bytes key -# r3 = k (r, s) -# r4 = mlen -# r5 = m -# -SYM_FUNC_START(poly1305_p10le_4blocks) -.align 5 - cmpdi 5, 64 - blt Out_no_poly1305 - - SAVE_REGS - - do_poly1305_init - - li 21, 0 # counter to message - - poly1305_setup_r - - # load previous H state - # break/convert r6 to 26 bits - ld 9, 0(3) - ld 10, 8(3) - ld 19, 16(3) - sldi 19, 19, 24 - mtvsrdd 41, 0, 19 - extrdi 14, 9, 26, 38 - extrdi 15, 9, 26, 12 - extrdi 16, 9, 12, 0 - mtvsrdd 36, 0, 14 - insrdi 16, 10, 14, 38 - mtvsrdd 37, 0, 15 - extrdi 17, 10, 26, 24 - mtvsrdd 38, 0, 16 - extrdi 18, 10, 24, 0 - mtvsrdd 39, 0, 17 - mtvsrdd 40, 0, 18 - vor 8, 8, 9 - - # input m1 m2 - add 20, 4, 21 - xxlor 49, 24, 24 - xxlor 50, 25, 25 - lxvw4x 43, 0, 20 - addi 17, 20, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - vand 9, 14, 25 # a0 - vsrd 10, 14, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - vand 10, 10, 25 # a1 - vspltisb 13, 12 - vand 16, 15, 25 - vsld 12, 16, 13 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vspltisb 13, 14 - vsrd 12, 15, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - vaddudm 20, 4, 9 - vaddudm 21, 5, 10 - vaddudm 22, 6, 11 - vaddudm 23, 7, 12 - vaddudm 24, 8, 13 - - # m3 m4 - addi 17, 17, 16 - lxvw4x 43, 0, 17 - addi 17, 17, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - vand 9, 14, 25 # a0 - vsrd 10, 14, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - vand 10, 10, 25 # a1 - vspltisb 13, 12 - vand 16, 15, 25 - vsld 12, 16, 13 - vspltisb 13, 14 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vsrd 12, 15, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] - vmrgow 4, 9, 20 - vmrgow 5, 10, 21 - vmrgow 6, 11, 22 - vmrgow 7, 12, 23 - vmrgow 8, 13, 24 - vaddudm 8, 8, 19 - - addi 5, 5, -64 # len -= 64 - addi 21, 21, 64 # offset += 64 - - li 9, 64 - divdu 31, 5, 9 - - cmpdi 31, 0 - ble Skip_block_loop - - mtctr 31 - -# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r -# Rewrite the polynominal sum of product as follows, -# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 -# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 -# .... Repeat -# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> -# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r -# -loop_4blocks: - - # Multiply odd words and even words - mul_odd - mul_even - # carry reduction - vspltisb 9, 2 - vsrd 10, 14, 31 - vsrd 11, 17, 31 - vand 7, 17, 25 - vand 4, 14, 25 - vaddudm 18, 18, 11 - vsrd 12, 18, 31 - vaddudm 15, 15, 10 - - vsrd 11, 15, 31 - vand 8, 18, 25 - vand 5, 15, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 16, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vaddudm 8, 8, 11 - - # input m1 m2 m3 m4 - add 20, 4, 21 - xxlor 49, 24, 24 - xxlor 50, 25, 25 - lxvw4x 43, 0, 20 - addi 17, 20, 16 - lxvw4x 44, 0, 17 - vperm 14, 11, 12, 17 - vperm 15, 11, 12, 18 - addi 17, 17, 16 - lxvw4x 43, 0, 17 - addi 17, 17, 16 - lxvw4x 44, 0, 17 - vperm 17, 11, 12, 17 - vperm 18, 11, 12, 18 - - vand 20, 14, 25 # a0 - vand 9, 17, 25 # a0 - vsrd 21, 14, 31 # >> 26 - vsrd 22, 21, 31 # 12 bits left - vsrd 10, 17, 31 # >> 26 - vsrd 11, 10, 31 # 12 bits left - - vand 21, 21, 25 # a1 - vand 10, 10, 25 # a1 - - vspltisb 13, 12 - vand 16, 15, 25 - vsld 23, 16, 13 - vor 22, 22, 23 - vand 22, 22, 25 # a2 - vand 16, 18, 25 - vsld 12, 16, 13 - vor 11, 11, 12 - vand 11, 11, 25 # a2 - vspltisb 13, 14 - vsrd 23, 15, 13 # >> 14 - vsrd 24, 23, 31 # >> 26, a4 - vand 23, 23, 25 # a3 - vsrd 12, 18, 13 # >> 14 - vsrd 13, 12, 31 # >> 26, a4 - vand 12, 12, 25 # a3 - - vaddudm 4, 4, 20 - vaddudm 5, 5, 21 - vaddudm 6, 6, 22 - vaddudm 7, 7, 23 - vaddudm 8, 8, 24 - - # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] - vmrgow 4, 9, 4 - vmrgow 5, 10, 5 - vmrgow 6, 11, 6 - vmrgow 7, 12, 7 - vmrgow 8, 13, 8 - vaddudm 8, 8, 19 - - addi 5, 5, -64 # len -= 64 - addi 21, 21, 64 # offset += 64 - - bdnz loop_4blocks - -Skip_block_loop: - xxlor 58, 0, 0 - xxlor 59, 1, 1 - xxlor 60, 2, 2 - xxlor 61, 3, 3 - xxlor 62, 4, 4 - xxlor 32, 5, 5 - xxlor 33, 6, 6 - xxlor 34, 7, 7 - xxlor 35, 8, 8 - - # Multiply odd words and even words - mul_odd - mul_even - - # Sum the products. - xxpermdi 41, 31, 46, 0 - xxpermdi 42, 31, 47, 0 - vaddudm 4, 14, 9 - xxpermdi 36, 31, 36, 3 - vaddudm 5, 15, 10 - xxpermdi 37, 31, 37, 3 - xxpermdi 43, 31, 48, 0 - vaddudm 6, 16, 11 - xxpermdi 38, 31, 38, 3 - xxpermdi 44, 31, 49, 0 - vaddudm 7, 17, 12 - xxpermdi 39, 31, 39, 3 - xxpermdi 45, 31, 50, 0 - vaddudm 8, 18, 13 - xxpermdi 40, 31, 40, 3 - - # carry reduction - vspltisb 9, 2 - vsrd 10, 4, 31 - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 8, 8, 11 - vsrd 12, 8, 31 - vaddudm 5, 5, 10 - - vsrd 11, 5, 31 - vand 8, 8, 25 - vand 5, 5, 25 - vaddudm 4, 4, 12 - vsld 10, 12, 9 - vaddudm 6, 6, 11 - - vsrd 13, 6, 31 - vand 6, 6, 25 - vaddudm 4, 4, 10 - vsrd 10, 4, 31 - vaddudm 7, 7, 13 - - vsrd 11, 7, 31 - vand 7, 7, 25 - vand 4, 4, 25 - vaddudm 5, 5, 10 - vsrd 10, 5, 31 - vand 5, 5, 25 - vaddudm 6, 6, 10 - vaddudm 8, 8, 11 - - b do_final_update - -do_final_update: - # combine 26 bit limbs - # v4, v5, v6, v7 and v8 are 26 bit vectors - vsld 5, 5, 31 - vor 20, 4, 5 - vspltisb 11, 12 - vsrd 12, 6, 11 - vsld 6, 6, 31 - vsld 6, 6, 31 - vor 20, 20, 6 - vspltisb 11, 14 - vsld 7, 7, 11 - vor 21, 7, 12 - mfvsrld 16, 40 # save last 2 bytes - vsld 8, 8, 11 - vsld 8, 8, 31 - vor 21, 21, 8 - mfvsrld 17, 52 - mfvsrld 19, 53 - srdi 16, 16, 24 - - std 17, 0(3) - std 19, 8(3) - stw 16, 16(3) - -Out_loop: - li 3, 0 - - RESTORE_REGS - - blr - -Out_no_poly1305: - li 3, 0 - blr -SYM_FUNC_END(poly1305_p10le_4blocks) - -# -# ======================================================================= -# The following functions implement 64 x 64 bits multiplication poly1305. -# -SYM_FUNC_START_LOCAL(Poly1305_init_64) - # mask 0x0FFFFFFC0FFFFFFC - # mask 0x0FFFFFFC0FFFFFFF - addis 10, 2, rmask@toc@ha - addi 10, 10, rmask@toc@l - ld 11, 0(10) - ld 12, 8(10) - - # initialize - # load key from r3 - ld 9, 24(3) - ld 10, 32(3) - and. 9, 9, 11 # cramp mask r0 - and. 10, 10, 12 # cramp mask r1 - - srdi 21, 10, 2 - add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 - - # setup r and s - li 25, 0 - mtvsrdd 32+0, 9, 19 # r0, s1 - mtvsrdd 32+1, 10, 9 # r1, r0 - mtvsrdd 32+2, 19, 25 # s1 - mtvsrdd 32+3, 9, 25 # r0 - - blr -SYM_FUNC_END(Poly1305_init_64) - -# Poly1305_mult -# v6 = (h0, h1), v8 = h2 -# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 -# -# Output: v7, v10, v11 -# -SYM_FUNC_START_LOCAL(Poly1305_mult) - # - # d0 = h0 * r0 + h1 * s1 - vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 - - # d1 = h0 * r1 + h1 * r0 + h2 * s1 - vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 - vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 - - # d2 = r0 - vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 - blr -SYM_FUNC_END(Poly1305_mult) - -# -# carry reduction -# h %=p -# -# Input: v7, v10, v11 -# Output: r27, r28, r29 -# -SYM_FUNC_START_LOCAL(Carry_reduction) - mfvsrld 27, 32+7 - mfvsrld 28, 32+10 - mfvsrld 29, 32+11 - mfvsrd 20, 32+7 # h0.h - mfvsrd 21, 32+10 # h1.h - - addc 28, 28, 20 - adde 29, 29, 21 - srdi 22, 29, 0x2 - sldi 23, 22, 0x2 - add 23, 23, 22 # (h2 & 3) * 5 - addc 27, 27, 23 # h0 - addze 28, 28 # h1 - andi. 29, 29, 0x3 # h2 - blr -SYM_FUNC_END(Carry_reduction) - -# -# poly1305 multiplication -# h *= r, h %= p -# d0 = h0 * r0 + h1 * s1 -# d1 = h0 * r1 + h1 * r0 + h2 * s1 -# d2 = h0 * r0 -# -# -# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) -# - no highbit if final leftover block (highbit = 0) -# -SYM_FUNC_START(poly1305_64s) - cmpdi 5, 0 - ble Out_no_poly1305_64 - - mflr 0 - std 0, 16(1) - stdu 1,-400(1) - - SAVE_GPR 14, 112, 1 - SAVE_GPR 15, 120, 1 - SAVE_GPR 16, 128, 1 - SAVE_GPR 17, 136, 1 - SAVE_GPR 18, 144, 1 - SAVE_GPR 19, 152, 1 - SAVE_GPR 20, 160, 1 - SAVE_GPR 21, 168, 1 - SAVE_GPR 22, 176, 1 - SAVE_GPR 23, 184, 1 - SAVE_GPR 24, 192, 1 - SAVE_GPR 25, 200, 1 - SAVE_GPR 26, 208, 1 - SAVE_GPR 27, 216, 1 - SAVE_GPR 28, 224, 1 - SAVE_GPR 29, 232, 1 - SAVE_GPR 30, 240, 1 - SAVE_GPR 31, 248, 1 - - # Init poly1305 - bl Poly1305_init_64 - - li 25, 0 # offset to inp and outp - - add 11, 25, 4 - - # load h - # h0, h1, h2? - ld 27, 0(3) - ld 28, 8(3) - lwz 29, 16(3) - - li 30, 16 - divdu 31, 5, 30 - - mtctr 31 - - mr 24, 6 # highbit - -Loop_block_64: - vxor 9, 9, 9 - - ld 20, 0(11) - ld 21, 8(11) - addi 11, 11, 16 - - addc 27, 27, 20 - adde 28, 28, 21 - adde 29, 29, 24 - - li 22, 0 - mtvsrdd 32+6, 27, 28 # h0, h1 - mtvsrdd 32+8, 29, 22 # h2 - - bl Poly1305_mult - - bl Carry_reduction - - bdnz Loop_block_64 - - std 27, 0(3) - std 28, 8(3) - stw 29, 16(3) - - li 3, 0 - - RESTORE_GPR 14, 112, 1 - RESTORE_GPR 15, 120, 1 - RESTORE_GPR 16, 128, 1 - RESTORE_GPR 17, 136, 1 - RESTORE_GPR 18, 144, 1 - RESTORE_GPR 19, 152, 1 - RESTORE_GPR 20, 160, 1 - RESTORE_GPR 21, 168, 1 - RESTORE_GPR 22, 176, 1 - RESTORE_GPR 23, 184, 1 - RESTORE_GPR 24, 192, 1 - RESTORE_GPR 25, 200, 1 - RESTORE_GPR 26, 208, 1 - RESTORE_GPR 27, 216, 1 - RESTORE_GPR 28, 224, 1 - RESTORE_GPR 29, 232, 1 - RESTORE_GPR 30, 240, 1 - RESTORE_GPR 31, 248, 1 - - addi 1, 1, 400 - ld 0, 16(1) - mtlr 0 - - blr - -Out_no_poly1305_64: - li 3, 0 - blr -SYM_FUNC_END(poly1305_64s) - -# -# Input: r3 = h, r4 = s, r5 = mac -# mac = h + s -# -SYM_FUNC_START(poly1305_emit_64) - ld 10, 0(3) - ld 11, 8(3) - ld 12, 16(3) - - # compare modulus - # h + 5 + (-p) - mr 6, 10 - mr 7, 11 - mr 8, 12 - addic. 6, 6, 5 - addze 7, 7 - addze 8, 8 - srdi 9, 8, 2 # overflow? - cmpdi 9, 0 - beq Skip_h64 - mr 10, 6 - mr 11, 7 - mr 12, 8 - -Skip_h64: - ld 6, 0(4) - ld 7, 8(4) - addc 10, 10, 6 - adde 11, 11, 7 - addze 12, 12 - - std 10, 0(5) - std 11, 8(5) - blr -SYM_FUNC_END(poly1305_emit_64) - -SYM_DATA_START_LOCAL(RMASK) -.align 5 -rmask: -.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f -cnum: -.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 -.long 0x1a, 0x00, 0x1a, 0x00 -.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 -.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 -.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f -SYM_DATA_END(RMASK) diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c index 9170892a8557..04c88e173ce1 100644 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -7,16 +7,13 @@ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> */ +#include <asm/switch_to.h> #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> -#include <asm/byteorder.h> -#include <asm/switch_to.h> -#include <linux/hardirq.h> +#include <linux/kernel.h> +#include <linux/preempt.h> +#include <linux/module.h> /* * MAX_BYTES defines the number of bytes that are allowed to be processed @@ -30,7 +27,7 @@ */ #define MAX_BYTES 2048 -extern void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks); +asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks); static void spe_begin(void) { @@ -46,126 +43,45 @@ static void spe_end(void) preempt_enable(); } -static inline void ppc_sha1_clear_context(struct sha1_state *sctx) +static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src, + int blocks) { - int count = sizeof(struct sha1_state) >> 2; - u32 *ptr = (u32 *)sctx; - - /* make sure we can clear the fast way */ - BUILD_BUG_ON(sizeof(struct sha1_state) % 4); - do { *ptr++ = 0; } while (--count); -} - -static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->count & 0x3f; - const unsigned int avail = 64 - offset; - unsigned int bytes; - const u8 *src = data; - - if (avail > len) { - sctx->count += len; - memcpy((char *)sctx->buffer + offset, src, len); - return 0; - } - - sctx->count += len; - - if (offset) { - memcpy((char *)sctx->buffer + offset, src, avail); + do { + int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE); spe_begin(); - ppc_spe_sha1_transform(sctx->state, (const u8 *)sctx->buffer, 1); + ppc_spe_sha1_transform(sctx->state, src, unit); spe_end(); - len -= avail; - src += avail; - } - - while (len > 63) { - bytes = (len > MAX_BYTES) ? MAX_BYTES : len; - bytes = bytes & ~0x3f; - - spe_begin(); - ppc_spe_sha1_transform(sctx->state, src, bytes >> 6); - spe_end(); - - src += bytes; - len -= bytes; - } - - memcpy((char *)sctx->buffer, src, len); - return 0; -} - -static int ppc_spe_sha1_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->count & 0x3f; - char *p = (char *)sctx->buffer + offset; - int padlen; - __be64 *pbits = (__be64 *)(((char *)&sctx->buffer) + 56); - __be32 *dst = (__be32 *)out; - - padlen = 55 - offset; - *p++ = 0x80; - - spe_begin(); - - if (padlen < 0) { - memset(p, 0x00, padlen + sizeof (u64)); - ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1); - p = (char *)sctx->buffer; - padlen = 56; - } - - memset(p, 0, padlen); - *pbits = cpu_to_be64(sctx->count << 3); - ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1); - - spe_end(); - - dst[0] = cpu_to_be32(sctx->state[0]); - dst[1] = cpu_to_be32(sctx->state[1]); - dst[2] = cpu_to_be32(sctx->state[2]); - dst[3] = cpu_to_be32(sctx->state[3]); - dst[4] = cpu_to_be32(sctx->state[4]); - - ppc_sha1_clear_context(sctx); - return 0; + src += unit * SHA1_BLOCK_SIZE; + blocks -= unit; + } while (blocks); } -static int ppc_spe_sha1_export(struct shash_desc *desc, void *out) +static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - return 0; + return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block); } -static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in) +static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - return 0; + sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block); + return sha1_base_finish(desc, out); } static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = ppc_spe_sha1_update, - .final = ppc_spe_sha1_final, - .export = ppc_spe_sha1_export, - .import = ppc_spe_sha1_import, - .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), + .finup = ppc_spe_sha1_finup, + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name= "sha1-ppc-spe", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c index f283bbd3f121..4593946aa9b3 100644 --- a/arch/powerpc/crypto/sha1.c +++ b/arch/powerpc/crypto/sha1.c @@ -13,107 +13,46 @@ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> */ #include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> -#include <asm/byteorder.h> - -void powerpc_sha_transform(u32 *state, const u8 *src); - -static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial, done; - const u8 *src; - - partial = sctx->count & 0x3f; - sctx->count += len; - done = 0; - src = data; - - if ((partial + len) > 63) { - - if (partial) { - done = -partial; - memcpy(sctx->buffer + partial, data, done + 64); - src = sctx->buffer; - } - - do { - powerpc_sha_transform(sctx->state, src); - done += 64; - src = data + done; - } while (done + 63 < len); - - partial = 0; - } - memcpy(sctx->buffer + partial, src, len - done); - - return 0; -} +#include <linux/kernel.h> +#include <linux/module.h> +asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src); -/* Add padding and return the message digest. */ -static int powerpc_sha1_final(struct shash_desc *desc, u8 *out) +static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data, + int blocks) { - struct sha1_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - u32 i, index, padlen; - __be64 bits; - static const u8 padding[64] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 */ - index = sctx->count & 0x3f; - padlen = (index < 56) ? (56 - index) : ((64+56) - index); - powerpc_sha1_update(desc, padding, padlen); - - /* Append length */ - powerpc_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); - - /* Store state in digest */ - for (i = 0; i < 5; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof *sctx); - - return 0; + do { + powerpc_sha_transform(sctx->state, data); + data += 64; + } while (--blocks); } -static int powerpc_sha1_export(struct shash_desc *desc, void *out) +static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - return 0; + return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block); } -static int powerpc_sha1_import(struct shash_desc *desc, const void *in) +/* Add padding and return the message digest. */ +static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - return 0; + sha1_base_do_finup(desc, src, len, powerpc_sha_block); + return sha1_base_finish(desc, out); } static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = powerpc_sha1_update, - .final = powerpc_sha1_final, - .export = powerpc_sha1_export, - .import = powerpc_sha1_import, - .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), + .finup = powerpc_sha1_finup, + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name= "sha1-powerpc", + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/crypto/sha256-spe-asm.S deleted file mode 100644 index cd99d71dae34..000000000000 --- a/arch/powerpc/crypto/sha256-spe-asm.S +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Fast SHA-256 implementation for SPE instruction set (PPC) - * - * This code makes use of the SPE SIMD instruction set as defined in - * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf - * Implementation is based on optimization guide notes from - * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf - * - * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> - */ - -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> - -#define rHP r3 /* pointer to hash values in memory */ -#define rKP r24 /* pointer to round constants */ -#define rWP r4 /* pointer to input data */ - -#define rH0 r5 /* 8 32 bit hash values in 8 registers */ -#define rH1 r6 -#define rH2 r7 -#define rH3 r8 -#define rH4 r9 -#define rH5 r10 -#define rH6 r11 -#define rH7 r12 - -#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ -#define rW1 r15 -#define rW2 r16 -#define rW3 r17 -#define rW4 r18 -#define rW5 r19 -#define rW6 r20 -#define rW7 r21 - -#define rT0 r22 /* 64 bit temporaries */ -#define rT1 r23 -#define rT2 r0 /* 32 bit temporaries */ -#define rT3 r25 - -#define CMP_KN_LOOP -#define CMP_KC_LOOP \ - cmpwi rT1,0; - -#define INITIALIZE \ - stwu r1,-128(r1); /* create stack frame */ \ - evstdw r14,8(r1); /* We must save non volatile */ \ - evstdw r15,16(r1); /* registers. Take the chance */ \ - evstdw r16,24(r1); /* and save the SPE part too */ \ - evstdw r17,32(r1); \ - evstdw r18,40(r1); \ - evstdw r19,48(r1); \ - evstdw r20,56(r1); \ - evstdw r21,64(r1); \ - evstdw r22,72(r1); \ - evstdw r23,80(r1); \ - stw r24,88(r1); /* save normal registers */ \ - stw r25,92(r1); - - -#define FINALIZE \ - evldw r14,8(r1); /* restore SPE registers */ \ - evldw r15,16(r1); \ - evldw r16,24(r1); \ - evldw r17,32(r1); \ - evldw r18,40(r1); \ - evldw r19,48(r1); \ - evldw r20,56(r1); \ - evldw r21,64(r1); \ - evldw r22,72(r1); \ - evldw r23,80(r1); \ - lwz r24,88(r1); /* restore normal registers */ \ - lwz r25,92(r1); \ - xor r0,r0,r0; \ - stw r0,8(r1); /* Delete sensitive data */ \ - stw r0,16(r1); /* that we might have pushed */ \ - stw r0,24(r1); /* from other context that runs */ \ - stw r0,32(r1); /* the same code. Assume that */ \ - stw r0,40(r1); /* the lower part of the GPRs */ \ - stw r0,48(r1); /* was already overwritten on */ \ - stw r0,56(r1); /* the way down to here */ \ - stw r0,64(r1); \ - stw r0,72(r1); \ - stw r0,80(r1); \ - addi r1,r1,128; /* cleanup stack frame */ - -#ifdef __BIG_ENDIAN__ -#define LOAD_DATA(reg, off) \ - lwz reg,off(rWP); /* load data */ -#define NEXT_BLOCK \ - addi rWP,rWP,64; /* increment per block */ -#else -#define LOAD_DATA(reg, off) \ - lwbrx reg,0,rWP; /* load data */ \ - addi rWP,rWP,4; /* increment per word */ -#define NEXT_BLOCK /* nothing to do */ -#endif - -#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ - LOAD_DATA(w, off) /* 1: W */ \ - rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ - rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ - rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ - and rT3,e,f; /* 1: ch = e and f */ \ - xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ - andc rT1,g,e; /* 1: ch' = ~e and g */ \ - lwz rT2,off(rKP); /* 1: K */ \ - xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ - add h,h,rT0; /* 1: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 1: temp1' = ch + w */ \ - rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ - add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ - rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + K */ \ - rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ - evmergelo w,w,w; /* shift W */ \ - or rT2,a,b; /* 1: maj = a or b */ \ - and rT1,a,b; /* 1: maj' = a and b */ \ - and rT2,rT2,c; /* 1: maj = maj and c */ \ - LOAD_DATA(w, off+4) /* 2: W */ \ - or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - add h,h,rT3; /* 1: h = temp1 + temp2 */ \ - rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - lwz rT2,off+4(rKP); /* 2: K */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 2: temp1' = ch + w */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - add g,g,rT2; /* 2: temp1 = temp1 + K */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ - rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ - evmergelohi rT0,w0,w1; /* w[-15] */ \ - rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ - evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ - rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ - evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ - add h,h,rT2; /* 1: temp1 = h + S1 */ \ - evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ - and rT2,e,f; /* 1: ch = e and f */ \ - evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ - andc rT3,g,e; /* 1: ch' = ~e and g */ \ - evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ - xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ - evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ - evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ - rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evldw rT1,off(rKP); /* k */ \ - rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ - evaddw w0,w0,rT0; /* w = w + s1 */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evmergelohi rT0,w4,w5; /* w[-7] */ \ - and rT3,a,b; /* 1: maj = a and b */ \ - evaddw w0,w0,rT0; /* w = w + w[-7] */ \ - CMP_K##k##_LOOP \ - add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ - evaddw rT1,rT1,w0; /* wk = w + k */ \ - xor rT3,a,b; /* 1: maj = a xor b */ \ - evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ - and rT3,rT3,c; /* 1: maj = maj and c */ \ - add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ - add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ - add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add h,h,rT2; /* 1: h = temp1 + temp2 */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -_GLOBAL(ppc_spe_sha256_transform) - INITIALIZE - - mtctr r5 - lwz rH0,0(rHP) - lwz rH1,4(rHP) - lwz rH2,8(rHP) - lwz rH3,12(rHP) - lwz rH4,16(rHP) - lwz rH5,20(rHP) - lwz rH6,24(rHP) - lwz rH7,28(rHP) - -ppc_spe_sha256_main: - lis rKP,PPC_SPE_SHA256_K@ha - addi rKP,rKP,PPC_SPE_SHA256_K@l - - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) -ppc_spe_sha256_16_rounds: - addi rKP,rKP,64 - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW0, rW1, rW4, rW5, rW7, N, 0) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, - rW1, rW2, rW5, rW6, rW0, N, 8) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW2, rW3, rW6, rW7, rW1, N, 16) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW3, rW4, rW7, rW0, rW2, N, 24) - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW4, rW5, rW0, rW1, rW3, N, 32) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, - rW5, rW6, rW1, rW2, rW4, N, 40) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW6, rW7, rW2, rW3, rW5, N, 48) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW7, rW0, rW3, rW4, rW6, C, 56) - bt gt,ppc_spe_sha256_16_rounds - - lwz rW0,0(rHP) - NEXT_BLOCK - lwz rW1,4(rHP) - lwz rW2,8(rHP) - lwz rW3,12(rHP) - lwz rW4,16(rHP) - lwz rW5,20(rHP) - lwz rW6,24(rHP) - lwz rW7,28(rHP) - - add rH0,rH0,rW0 - stw rH0,0(rHP) - add rH1,rH1,rW1 - stw rH1,4(rHP) - add rH2,rH2,rW2 - stw rH2,8(rHP) - add rH3,rH3,rW3 - stw rH3,12(rHP) - add rH4,rH4,rW4 - stw rH4,16(rHP) - add rH5,rH5,rW5 - stw rH5,20(rHP) - add rH6,rH6,rW6 - stw rH6,24(rHP) - add rH7,rH7,rW7 - stw rH7,28(rHP) - - bdnz ppc_spe_sha256_main - - FINALIZE - blr - -.data -.align 5 -PPC_SPE_SHA256_K: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c deleted file mode 100644 index 2997d13236e0..000000000000 --- a/arch/powerpc/crypto/sha256-spe-glue.c +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for SHA-256 implementation for SPE instructions (PPC) - * - * Based on generic implementation. The assembler module takes care - * about the SPE registers so it can run from interrupt context. - * - * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> - */ - -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <asm/byteorder.h> -#include <asm/switch_to.h> -#include <linux/hardirq.h> - -/* - * MAX_BYTES defines the number of bytes that are allowed to be processed - * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 - * operations per 64 bytes. e500 cores can issue two arithmetic instructions - * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). - * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. - * Headroom for cache misses included. Even with the low end model clocked - * at 667 MHz this equals to a critical time window of less than 27us. - * - */ -#define MAX_BYTES 1024 - -extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); - -static void spe_begin(void) -{ - /* We just start SPE operations and will save SPE registers later. */ - preempt_disable(); - enable_kernel_spe(); -} - -static void spe_end(void) -{ - disable_kernel_spe(); - /* reenable preemption */ - preempt_enable(); -} - -static inline void ppc_sha256_clear_context(struct sha256_state *sctx) -{ - int count = sizeof(struct sha256_state) >> 2; - u32 *ptr = (u32 *)sctx; - - /* make sure we can clear the fast way */ - BUILD_BUG_ON(sizeof(struct sha256_state) % 4); - do { *ptr++ = 0; } while (--count); -} - -static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->count & 0x3f; - const unsigned int avail = 64 - offset; - unsigned int bytes; - const u8 *src = data; - - if (avail > len) { - sctx->count += len; - memcpy((char *)sctx->buf + offset, src, len); - return 0; - } - - sctx->count += len; - - if (offset) { - memcpy((char *)sctx->buf + offset, src, avail); - - spe_begin(); - ppc_spe_sha256_transform(sctx->state, (const u8 *)sctx->buf, 1); - spe_end(); - - len -= avail; - src += avail; - } - - while (len > 63) { - /* cut input data into smaller blocks */ - bytes = (len > MAX_BYTES) ? MAX_BYTES : len; - bytes = bytes & ~0x3f; - - spe_begin(); - ppc_spe_sha256_transform(sctx->state, src, bytes >> 6); - spe_end(); - - src += bytes; - len -= bytes; - } - - memcpy((char *)sctx->buf, src, len); - return 0; -} - -static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - const unsigned int offset = sctx->count & 0x3f; - char *p = (char *)sctx->buf + offset; - int padlen; - __be64 *pbits = (__be64 *)(((char *)&sctx->buf) + 56); - __be32 *dst = (__be32 *)out; - - padlen = 55 - offset; - *p++ = 0x80; - - spe_begin(); - - if (padlen < 0) { - memset(p, 0x00, padlen + sizeof (u64)); - ppc_spe_sha256_transform(sctx->state, sctx->buf, 1); - p = (char *)sctx->buf; - padlen = 56; - } - - memset(p, 0, padlen); - *pbits = cpu_to_be64(sctx->count << 3); - ppc_spe_sha256_transform(sctx->state, sctx->buf, 1); - - spe_end(); - - dst[0] = cpu_to_be32(sctx->state[0]); - dst[1] = cpu_to_be32(sctx->state[1]); - dst[2] = cpu_to_be32(sctx->state[2]); - dst[3] = cpu_to_be32(sctx->state[3]); - dst[4] = cpu_to_be32(sctx->state[4]); - dst[5] = cpu_to_be32(sctx->state[5]); - dst[6] = cpu_to_be32(sctx->state[6]); - dst[7] = cpu_to_be32(sctx->state[7]); - - ppc_sha256_clear_context(sctx); - return 0; -} - -static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out) -{ - __be32 D[SHA256_DIGEST_SIZE >> 2]; - __be32 *dst = (__be32 *)out; - - ppc_spe_sha256_final(desc, (u8 *)D); - - /* avoid bytewise memcpy */ - dst[0] = D[0]; - dst[1] = D[1]; - dst[2] = D[2]; - dst[3] = D[3]; - dst[4] = D[4]; - dst[5] = D[5]; - dst[6] = D[6]; - - /* clear sensitive data */ - memzero_explicit(D, SHA256_DIGEST_SIZE); - return 0; -} - -static int ppc_spe_sha256_export(struct shash_desc *desc, void *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - return 0; -} - -static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - return 0; -} - -static struct shash_alg algs[2] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = ppc_spe_sha256_update, - .final = ppc_spe_sha256_final, - .export = ppc_spe_sha256_export, - .import = ppc_spe_sha256_import, - .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name= "sha256-ppc-spe", - .cra_priority = 300, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = ppc_spe_sha256_update, - .final = ppc_spe_sha224_final, - .export = ppc_spe_sha256_export, - .import = ppc_spe_sha256_import, - .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name= "sha224-ppc-spe", - .cra_priority = 300, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init ppc_spe_sha256_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit ppc_spe_sha256_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(ppc_spe_sha256_mod_init); -module_exit(ppc_spe_sha256_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized"); - -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-ppc-spe"); -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-ppc-spe"); |