author    | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 20:40:14 +0300
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 20:40:14 +0300
commit    | ab5c60b79ab6cc50b39bbb21b2f9fb55af900b84 (patch)
tree      | 71fa895fbf01e3b88f26cf257d9105f9d286b631 /arch
parent    | 5577416c39652d395a6045677f4f598564aba1cf (diff)
parent    | 3cbfe80737c18ac6e635421ab676716a393d3074 (diff)
download  | linux-ab5c60b79ab6cc50b39bbb21b2f9fb55af900b84.tar.xz
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"API:
- Add support for allocating transforms on a specific NUMA Node
- Introduce the flag CRYPTO_ALG_ALLOCATES_MEMORY for storage users
Algorithms:
- Drop PMULL based ghash on arm64
- Fixes for building with clang on x86
- Add sha256 helper that does the digest in one go
- Add SP800-56A rev 3 validation checks to dh
Drivers:
- Permit users to specify NUMA node in hisilicon/zip
- Add support for i.MX6 in imx-rngc
- Add sa2ul crypto driver
- Add BA431 hwrng driver
- Add Ingenic JZ4780 and X1000 hwrng driver
- Spread IRQ affinity in inside-secure and marvell/cesa"
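The two API items above are the user-visible part of this pull. The following is a rough caller-side sketch, not taken from this diff: the NUMA-aware allocator variant (here crypto_alloc_acomp_node()) and the use of CRYPTO_ALG_ALLOCATES_MEMORY as a mask bit are inferred from the summary, so the exact prototypes should be checked against include/crypto/ in this tree.

/*
 * Hypothetical caller-side sketch, not part of this diff: allocate a
 * compression transform near a given NUMA node, and allocate a storage
 * cipher while excluding implementations that may allocate memory on
 * the I/O path.  Names and signatures are inferred from the summary
 * above, not verified against this tree.
 */
#include <crypto/acompress.h>
#include <crypto/skcipher.h>
#include <linux/err.h>

static int crypto_api_sketch(int nid)
{
        struct crypto_acomp *acomp;
        struct crypto_skcipher *skc;

        /* Prefer an implementation whose working memory sits on node 'nid'. */
        acomp = crypto_alloc_acomp_node("deflate", 0, 0, nid);
        if (IS_ERR(acomp))
                return PTR_ERR(acomp);

        /*
         * Storage-style caller: putting the flag in the mask (with the
         * matching type bit clear) selects only implementations that do
         * NOT set CRYPTO_ALG_ALLOCATES_MEMORY.
         */
        skc = crypto_alloc_skcipher("xts(aes)", 0, CRYPTO_ALG_ALLOCATES_MEMORY);
        if (IS_ERR(skc)) {
                crypto_free_acomp(acomp);
                return PTR_ERR(skc);
        }

        crypto_free_skcipher(skc);
        crypto_free_acomp(acomp);
        return 0;
}

Passing the flag in the mask with the corresponding type bit left clear asks the API for implementations that keep the flag unset, i.e. ones that do not allocate memory per request — the property storage paths care about.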
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (157 commits)
crypto: sa2ul - Fix inconsistent IS_ERR and PTR_ERR
hwrng: core - remove redundant initialization of variable ret
crypto: x86/curve25519 - Remove unused carry variables
crypto: ingenic - Add hardware RNG for Ingenic JZ4780 and X1000
dt-bindings: RNG: Add Ingenic RNG bindings.
crypto: caam/qi2 - add module alias
crypto: caam - add more RNG hw error codes
crypto: caam/jr - remove incorrect reference to caam_jr_register()
crypto: caam - silence .setkey in case of bad key length
crypto: caam/qi2 - create ahash shared descriptors only once
crypto: caam/qi2 - fix error reporting for caam_hash_alloc
crypto: caam - remove deadcode on 32-bit platforms
crypto: ccp - use generic power management
crypto: xts - Replace memcpy() invocation with simple assignment
crypto: marvell/cesa - irq balance
crypto: inside-secure - irq balance
crypto: ecc - SP800-56A rev 3 local public key validation
crypto: dh - SP800-56A rev 3 local public key validation
crypto: dh - check validity of Z before export
lib/mpi: Add mpi_sub_ui()
...
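The "sha256 helper that does the digest in one go" from the summary above is easiest to picture from the caller side. A minimal sketch follows; the sha256() name and (data, len, out) signature are assumptions based on the summary and should be confirmed against <crypto/sha.h> in this tree.

/*
 * Sketch of the one-shot hash helper mentioned in the pull summary;
 * the sha256() prototype used here is an assumption, not taken from
 * this diff.
 */
#include <crypto/sha.h>
#include <linux/string.h>
#include <linux/types.h>

static void digest_blob(const u8 *buf, unsigned int len)
{
        u8 digest[SHA256_DIGEST_SIZE];

        /* One call replaces the sha256_init()/update()/final() sequence. */
        sha256(buf, len, digest);

        /* ... consume digest ... */
        memzero_explicit(digest, sizeof(digest));
}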
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm/crypto/crc32-ce-core.S            |   2
-rw-r--r-- | arch/arm/crypto/ghash-ce-glue.c            |  51
-rw-r--r-- | arch/arm/crypto/sha1-armv4-large.S         |   2
-rw-r--r-- | arch/arm/crypto/sha256-armv4.pl            |   2
-rw-r--r-- | arch/arm/crypto/sha256-core.S_shipped      |   2
-rw-r--r-- | arch/arm/crypto/sha512-armv4.pl            |   4
-rw-r--r-- | arch/arm/crypto/sha512-core.S_shipped      |   4
-rw-r--r-- | arch/arm64/crypto/ghash-ce-glue.c          | 257
-rw-r--r-- | arch/sparc/crypto/sha256_glue.c            |  14
-rw-r--r-- | arch/x86/crypto/aes_ctrby8_avx-x86_64.S    |  15
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S          | 739
-rw-r--r-- | arch/x86/crypto/aesni-intel_avx-x86_64.S   |   1
-rw-r--r-- | arch/x86/crypto/chacha-ssse3-x86_64.S      |  16
-rw-r--r-- | arch/x86/crypto/chacha_glue.c              |  17
-rw-r--r-- | arch/x86/crypto/crc32-pclmul_asm.S         |  47
-rw-r--r-- | arch/x86/crypto/crc32c-pcl-intel-asm_64.S  |   7
-rw-r--r-- | arch/x86/crypto/curve25519-x86_64.c        |   6
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_asm.S  |  17
-rw-r--r-- | arch/x86/include/asm/inst.h                | 163
19 files changed, 551 insertions(+), 815 deletions(-)
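The arch/arm/crypto/ghash-ce-glue.c hunk below replaces a runtime function pointer (pmull_ghash_update) with a static key (use_p64) that is enabled once at module init when HWCAP2_PMULL is present. A stripped-down sketch of that jump-label pattern, with illustrative names standing in for the ones used in the diff:

/*
 * Stand-alone sketch of the jump-label pattern used by the arm
 * ghash-ce rework below; the real key is called use_p64 in
 * arch/arm/crypto/ghash-ce-glue.c.
 */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>

/* Patched at runtime; defaults to the "false" (fallback) branch. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_fast_path);

static void update_fast(void)     { /* e.g. the PMULL (p64) routine */ }
static void update_fallback(void) { /* e.g. the NEON-only (p8) routine */ }

static int __init pick_impl(bool cpu_has_pmull)
{
        /* Decided once at init, as in ghash_ce_mod_init() below. */
        if (cpu_has_pmull)
                static_branch_enable(&use_fast_path);
        return 0;
}

static void do_update(void)
{
        /* Compiles to a patched branch rather than an indirect call. */
        if (static_branch_likely(&use_fast_path))
                update_fast();
        else
                update_fallback();
}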
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S index 5cbd4a6fedad..3f13a76b9066 100644 --- a/arch/arm/crypto/crc32-ce-core.S +++ b/arch/arm/crypto/crc32-ce-core.S @@ -39,7 +39,7 @@ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found * at: - * http://www.intel.com/products/processor/manuals/ + * https://www.intel.com/products/processor/manuals/ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual * Volume 2B: Instruction Set Reference, N-Z * diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index a00fd329255f..f13401f3e669 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -16,6 +16,7 @@ #include <crypto/gf128mul.h> #include <linux/cpufeature.h> #include <linux/crypto.h> +#include <linux/jump_label.h> #include <linux/module.h> MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions"); @@ -27,12 +28,8 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_DIGEST_SIZE 16 struct ghash_key { - u64 h[2]; - u64 h2[2]; - u64 h3[2]; - u64 h4[2]; - be128 k; + u64 h[][2]; }; struct ghash_desc_ctx { @@ -46,16 +43,12 @@ struct ghash_async_ctx { }; asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, - const char *head); + u64 const h[][2], const char *head); asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, - const char *head); + u64 const h[][2], const char *head); -static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, - const char *head); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_p64); static int ghash_init(struct shash_desc *desc) { @@ -70,7 +63,10 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, { if (likely(crypto_simd_usable())) { kernel_neon_begin(); - pmull_ghash_update(blocks, dg, src, key, head); + if (static_branch_likely(&use_p64)) + pmull_ghash_update_p64(blocks, dg, src, key->h, head); + else + pmull_ghash_update_p8(blocks, dg, src, key->h, head); kernel_neon_end(); } else { be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; @@ -161,25 +157,26 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *inkey, unsigned int keylen) { struct ghash_key *key = crypto_shash_ctx(tfm); - be128 h; if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - ghash_reflect(key->h, &key->k); + ghash_reflect(key->h[0], &key->k); - h = key->k; - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h2, &h); + if (static_branch_likely(&use_p64)) { + be128 h = key->k; - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h3, &h); + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h[1], &h); - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h4, &h); + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h[2], &h); + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h[3], &h); + } return 0; } @@ -195,7 +192,7 @@ static struct shash_alg ghash_alg = { .base.cra_driver_name = "ghash-ce-sync", .base.cra_priority = 300 - 1, .base.cra_blocksize = GHASH_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ghash_key), + .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), .base.cra_module = THIS_MODULE, }; @@ -354,10 +351,10 @@ static int __init ghash_ce_mod_init(void) if (!(elf_hwcap & HWCAP_NEON)) return -ENODEV; - if (elf_hwcap2 & HWCAP2_PMULL) - pmull_ghash_update = 
pmull_ghash_update_p64; - else - pmull_ghash_update = pmull_ghash_update_p8; + if (elf_hwcap2 & HWCAP2_PMULL) { + ghash_alg.base.cra_ctxsize += 3 * sizeof(u64[2]); + static_branch_enable(&use_p64); + } err = crypto_register_shash(&ghash_alg); if (err) diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S index f82cd8cf5a09..1c8b685149f2 100644 --- a/arch/arm/crypto/sha1-armv4-large.S +++ b/arch/arm/crypto/sha1-armv4-large.S @@ -13,7 +13,7 @@ @ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL @ project. The module is, however, dual licensed under OpenSSL and @ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. +@ details see https://www.openssl.org/~appro/cryptogams/. @ ==================================================================== @ sha1_block procedure for ARMv4. diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl index a03cf4dfb781..9f96ff48e4a8 100644 --- a/arch/arm/crypto/sha256-armv4.pl +++ b/arch/arm/crypto/sha256-armv4.pl @@ -13,7 +13,7 @@ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. +# details see https://www.openssl.org/~appro/cryptogams/. # ==================================================================== # SHA256 block procedure for ARMv4. May 2007. diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped index 054aae0edfce..ea04b2ab0c33 100644 --- a/arch/arm/crypto/sha256-core.S_shipped +++ b/arch/arm/crypto/sha256-core.S_shipped @@ -12,7 +12,7 @@ @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @ project. The module is, however, dual licensed under OpenSSL and @ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. +@ details see https://www.openssl.org/~appro/cryptogams/. @ ==================================================================== @ SHA256 block procedure for ARMv4. May 2007. diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl index 788c17b56ecc..69df68981acd 100644 --- a/arch/arm/crypto/sha512-armv4.pl +++ b/arch/arm/crypto/sha512-armv4.pl @@ -13,7 +13,7 @@ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. +# details see https://www.openssl.org/~appro/cryptogams/. # ==================================================================== # SHA512 block procedure for ARMv4. September 2007. @@ -43,7 +43,7 @@ # terms it's 22.6 cycles per byte, which is disappointing result. # Technical writers asserted that 3-way S4 pipeline can sustain # multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html # for further details. On side note Cortex-A15 processes one byte in # 16 cycles. 
diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped index 710ea309769e..cb147db5cbfe 100644 --- a/arch/arm/crypto/sha512-core.S_shipped +++ b/arch/arm/crypto/sha512-core.S_shipped @@ -12,7 +12,7 @@ @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @ project. The module is, however, dual licensed under OpenSSL and @ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. +@ details see https://www.openssl.org/~appro/cryptogams/. @ ==================================================================== @ SHA512 block procedure for ARMv4. September 2007. @@ -42,7 +42,7 @@ @ terms it's 22.6 cycles per byte, which is disappointing result. @ Technical writers asserted that 3-way S4 pipeline can sustain @ multiple NEON instructions per cycle, but dual NEON issue could -@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +@ not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html @ for further details. On side note Cortex-A15 processes one byte in @ 16 cycles. diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 22831d3b7f62..da1034867aaa 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -31,12 +31,8 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GCM_IV_SIZE 12 struct ghash_key { - u64 h[2]; - u64 h2[2]; - u64 h3[2]; - u64 h4[2]; - be128 k; + u64 h[][2]; }; struct ghash_desc_ctx { @@ -51,22 +47,18 @@ struct gcm_aes_ctx { }; asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, - const char *head); + u64 const h[][2], const char *head); asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, - const char *head); + u64 const h[][2], const char *head); asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[], - struct ghash_key const *k, u64 dg[], - u8 ctr[], u32 const rk[], int rounds, - u8 tag[]); + u64 const h[][2], u64 dg[], u8 ctr[], + u32 const rk[], int rounds, u8 tag[]); asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], - struct ghash_key const *k, u64 dg[], - u8 ctr[], u32 const rk[], int rounds, - u8 tag[]); + u64 const h[][2], u64 dg[], u8 ctr[], + u32 const rk[], int rounds, u8 tag[]); static int ghash_init(struct shash_desc *desc) { @@ -77,48 +69,51 @@ static int ghash_init(struct shash_desc *desc) } static void ghash_do_update(int blocks, u64 dg[], const char *src, - struct ghash_key *key, const char *head, - void (*simd_update)(int blocks, u64 dg[], - const char *src, - struct ghash_key const *k, - const char *head)) + struct ghash_key *key, const char *head) { - if (likely(crypto_simd_usable() && simd_update)) { - kernel_neon_begin(); - simd_update(blocks, dg, src, key, head); - kernel_neon_end(); - } else { - be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; + be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; - do { - const u8 *in = src; - - if (head) { - in = head; - blocks++; - head = NULL; - } else { - src += GHASH_BLOCK_SIZE; - } + do { + const u8 *in = src; + + if (head) { + in = head; + blocks++; + head = NULL; + } else { + src += GHASH_BLOCK_SIZE; + } - crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); - gf128mul_lle(&dst, &key->k); - } while (--blocks); + crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); + gf128mul_lle(&dst, &key->k); + } while (--blocks); - dg[0] = be64_to_cpu(dst.b); - dg[1] = be64_to_cpu(dst.a); + 
dg[0] = be64_to_cpu(dst.b); + dg[1] = be64_to_cpu(dst.a); +} + +static __always_inline +void ghash_do_simd_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head, + void (*simd_update)(int blocks, u64 dg[], + const char *src, + u64 const h[][2], + const char *head)) +{ + if (likely(crypto_simd_usable())) { + kernel_neon_begin(); + simd_update(blocks, dg, src, key->h, head); + kernel_neon_end(); + } else { + ghash_do_update(blocks, dg, src, key, head); } } /* avoid hogging the CPU for too long */ #define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) -static int __ghash_update(struct shash_desc *desc, const u8 *src, - unsigned int len, - void (*simd_update)(int blocks, u64 dg[], - const char *src, - struct ghash_key const *k, - const char *head)) +static int ghash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) { struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; @@ -143,9 +138,9 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src, do { int chunk = min(blocks, MAX_BLOCKS); - ghash_do_update(chunk, ctx->digest, src, key, - partial ? ctx->buf : NULL, - simd_update); + ghash_do_simd_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL, + pmull_ghash_update_p8); blocks -= chunk; src += chunk * GHASH_BLOCK_SIZE; @@ -157,39 +152,7 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src, return 0; } -static int ghash_update_p8(struct shash_desc *desc, const u8 *src, - unsigned int len) -{ - return __ghash_update(desc, src, len, pmull_ghash_update_p8); -} - -static int ghash_update_p64(struct shash_desc *desc, const u8 *src, - unsigned int len) -{ - return __ghash_update(desc, src, len, pmull_ghash_update_p64); -} - -static int ghash_final_p8(struct shash_desc *desc, u8 *dst) -{ - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; - - if (partial) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); - - memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - - ghash_do_update(1, ctx->digest, ctx->buf, key, NULL, - pmull_ghash_update_p8); - } - put_unaligned_be64(ctx->digest[1], dst); - put_unaligned_be64(ctx->digest[0], dst + 8); - - *ctx = (struct ghash_desc_ctx){}; - return 0; -} - -static int ghash_final_p64(struct shash_desc *desc, u8 *dst) +static int ghash_final(struct shash_desc *desc, u8 *dst) { struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; @@ -199,8 +162,8 @@ static int ghash_final_p64(struct shash_desc *desc, u8 *dst) memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - ghash_do_update(1, ctx->digest, ctx->buf, key, NULL, - pmull_ghash_update_p64); + ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL, + pmull_ghash_update_p8); } put_unaligned_be64(ctx->digest[1], dst); put_unaligned_be64(ctx->digest[0], dst + 8); @@ -220,29 +183,6 @@ static void ghash_reflect(u64 h[], const be128 *k) h[1] ^= 0xc200000000000000UL; } -static int __ghash_setkey(struct ghash_key *key, - const u8 *inkey, unsigned int keylen) -{ - be128 h; - - /* needed for the fallback */ - memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - - ghash_reflect(key->h, &key->k); - - h = key->k; - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h2, &h); - - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h3, &h); - - gf128mul_lle(&h, &key->k); - ghash_reflect(key->h4, &h); - - return 0; -} - static int ghash_setkey(struct crypto_shash *tfm, const u8 *inkey, unsigned 
int keylen) { @@ -251,38 +191,28 @@ static int ghash_setkey(struct crypto_shash *tfm, if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - return __ghash_setkey(key, inkey, keylen); + /* needed for the fallback */ + memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); + + ghash_reflect(key->h[0], &key->k); + return 0; } -static struct shash_alg ghash_alg[] = {{ +static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-neon", .base.cra_priority = 150, .base.cra_blocksize = GHASH_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ghash_key), - .base.cra_module = THIS_MODULE, - - .digestsize = GHASH_DIGEST_SIZE, - .init = ghash_init, - .update = ghash_update_p8, - .final = ghash_final_p8, - .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), -}, { - .base.cra_name = "ghash", - .base.cra_driver_name = "ghash-ce", - .base.cra_priority = 200, - .base.cra_blocksize = GHASH_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct ghash_key), + .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), .base.cra_module = THIS_MODULE, .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, - .update = ghash_update_p64, - .final = ghash_final_p64, + .update = ghash_update, + .final = ghash_final, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), -}}; +}; static int num_rounds(struct crypto_aes_ctx *ctx) { @@ -301,6 +231,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, { struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); u8 key[GHASH_BLOCK_SIZE]; + be128 h; int ret; ret = aes_expandkey(&ctx->aes_key, inkey, keylen); @@ -309,7 +240,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){}); - return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); + /* needed for the fallback */ + memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE); + + ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k); + + h = ctx->ghash_key.k; + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[1], &h); + + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[2], &h); + + gf128mul_lle(&h, &ctx->ghash_key.k); + ghash_reflect(ctx->ghash_key.h[3], &h); + + return 0; } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -341,9 +287,9 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) { int blocks = count / GHASH_BLOCK_SIZE; - ghash_do_update(blocks, dg, src, &ctx->ghash_key, - *buf_count ? buf : NULL, - pmull_ghash_update_p64); + ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key, + *buf_count ? 
buf : NULL, + pmull_ghash_update_p64); src += blocks * GHASH_BLOCK_SIZE; count %= GHASH_BLOCK_SIZE; @@ -387,8 +333,8 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) if (buf_count) { memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL, - pmull_ghash_update_p64); + ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL, + pmull_ghash_update_p64); } } @@ -433,8 +379,8 @@ static int gcm_encrypt(struct aead_request *req) } kernel_neon_begin(); - pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg, - iv, ctx->aes_key.key_enc, nrounds, + pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, nrounds, tag); kernel_neon_end(); @@ -464,7 +410,7 @@ static int gcm_encrypt(struct aead_request *req) } while (--remaining > 0); ghash_do_update(blocks, dg, walk.dst.virt.addr, - &ctx->ghash_key, NULL, NULL); + &ctx->ghash_key, NULL); err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); @@ -483,7 +429,7 @@ static int gcm_encrypt(struct aead_request *req) tag = (u8 *)&lengths; ghash_do_update(1, dg, tag, &ctx->ghash_key, - walk.nbytes ? buf : NULL, NULL); + walk.nbytes ? buf : NULL); if (walk.nbytes) err = skcipher_walk_done(&walk, 0); @@ -547,8 +493,8 @@ static int gcm_decrypt(struct aead_request *req) } kernel_neon_begin(); - pmull_gcm_decrypt(nbytes, dst, src, &ctx->ghash_key, dg, - iv, ctx->aes_key.key_enc, nrounds, + pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h, + dg, iv, ctx->aes_key.key_enc, nrounds, tag); kernel_neon_end(); @@ -568,7 +514,7 @@ static int gcm_decrypt(struct aead_request *req) u8 *dst = walk.dst.virt.addr; ghash_do_update(blocks, dg, walk.src.virt.addr, - &ctx->ghash_key, NULL, NULL); + &ctx->ghash_key, NULL); do { aes_encrypt(&ctx->aes_key, buf, iv); @@ -591,7 +537,7 @@ static int gcm_decrypt(struct aead_request *req) tag = (u8 *)&lengths; ghash_do_update(1, dg, tag, &ctx->ghash_key, - walk.nbytes ? buf : NULL, NULL); + walk.nbytes ? 
buf : NULL); if (walk.nbytes) { aes_encrypt(&ctx->aes_key, buf, iv); @@ -635,43 +581,28 @@ static struct aead_alg gcm_aes_alg = { .base.cra_driver_name = "gcm-aes-ce", .base.cra_priority = 300, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct gcm_aes_ctx), + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) + + 4 * sizeof(u64[2]), .base.cra_module = THIS_MODULE, }; static int __init ghash_ce_mod_init(void) { - int ret; - if (!cpu_have_named_feature(ASIMD)) return -ENODEV; if (cpu_have_named_feature(PMULL)) - ret = crypto_register_shashes(ghash_alg, - ARRAY_SIZE(ghash_alg)); - else - /* only register the first array element */ - ret = crypto_register_shash(ghash_alg); + return crypto_register_aead(&gcm_aes_alg); - if (ret) - return ret; - - if (cpu_have_named_feature(PMULL)) { - ret = crypto_register_aead(&gcm_aes_alg); - if (ret) - crypto_unregister_shashes(ghash_alg, - ARRAY_SIZE(ghash_alg)); - } - return ret; + return crypto_register_shash(&ghash_alg); } static void __exit ghash_ce_mod_exit(void) { if (cpu_have_named_feature(PMULL)) - crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg)); + crypto_unregister_aead(&gcm_aes_alg); else - crypto_unregister_shash(ghash_alg); - crypto_unregister_aead(&gcm_aes_alg); + crypto_unregister_shash(&ghash_alg); } static const struct cpu_feature ghash_cpu_feature[] = { diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c index 286bc8ecf15b..ca2547df9652 100644 --- a/arch/sparc/crypto/sha256_glue.c +++ b/arch/sparc/crypto/sha256_glue.c @@ -156,7 +156,7 @@ static int sha256_sparc64_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg sha256 = { +static struct shash_alg sha256_alg = { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_sparc64_init, .update = sha256_sparc64_update, @@ -174,7 +174,7 @@ static struct shash_alg sha256 = { } }; -static struct shash_alg sha224 = { +static struct shash_alg sha224_alg = { .digestsize = SHA224_DIGEST_SIZE, .init = sha224_sparc64_init, .update = sha256_sparc64_update, @@ -206,13 +206,13 @@ static bool __init sparc64_has_sha256_opcode(void) static int __init sha256_sparc64_mod_init(void) { if (sparc64_has_sha256_opcode()) { - int ret = crypto_register_shash(&sha224); + int ret = crypto_register_shash(&sha224_alg); if (ret < 0) return ret; - ret = crypto_register_shash(&sha256); + ret = crypto_register_shash(&sha256_alg); if (ret < 0) { - crypto_unregister_shash(&sha224); + crypto_unregister_shash(&sha224_alg); return ret; } @@ -225,8 +225,8 @@ static int __init sha256_sparc64_mod_init(void) static void __exit sha256_sparc64_mod_fini(void) { - crypto_unregister_shash(&sha224); - crypto_unregister_shash(&sha256); + crypto_unregister_shash(&sha224_alg); + crypto_unregister_shash(&sha256_alg); } module_init(sha256_sparc64_mod_init); diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index ec437db1fa54..3f0fc7dd87d7 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -63,7 +63,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #define VMOVDQ vmovdqu @@ -127,10 +126,6 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ -.macro setddq n - var_ddq_add = ddq_add_\n -.endm - /* generate a unique variable for xmm register */ .macro setxdata n var_xdata = %xmm\n @@ -140,9 +135,7 @@ ddq_add_8: .macro club name, id .altmacro - .if \name == DDQ_DATA - setddq %\id - .elseif \name == XDATA + .if \name == XDATA setxdata %\id .endif .noaltmacro @@ -165,9 
+158,8 @@ ddq_add_8: .set i, 1 .rept (by - 1) - club DDQ_DATA, i club XDATA, i - vpaddq var_ddq_add(%rip), xcounter, var_xdata + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata vptest ddq_low_msk(%rip), var_xdata jnz 1f vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata @@ -180,8 +172,7 @@ ddq_add_8: vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - club DDQ_DATA, by - vpaddq var_ddq_add(%rip), xcounter, xcounter + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter vptest ddq_low_msk(%rip), xcounter jnz 1f vpaddq ddq_high_add_1(%rip), xcounter, xcounter diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 54e7d15dbd0d..1852b19a73a0 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> #include <asm/nospec-branch.h> @@ -201,7 +200,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff mov \SUBKEY, %r12 movdqu (%r12), \TMP3 movdqa SHUF_MASK(%rip), \TMP2 - PSHUFB_XMM \TMP2, \TMP3 + pshufb \TMP2, \TMP3 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) @@ -263,10 +262,10 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv movdqa SHUF_MASK(%rip), %xmm2 - PSHUFB_XMM %xmm2, %xmm0 + pshufb %xmm2, %xmm0 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 movdqu HashKey(%arg2), %xmm13 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ @@ -347,7 +346,7 @@ _zero_cipher_left_\@: paddd ONE(%rip), %xmm0 # INCR CNT to get Yn movdqu %xmm0, CurCount(%arg2) movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) movdqu %xmm0, PBlockEncKey(%arg2) @@ -377,7 +376,7 @@ _large_enough_update_\@: # get the appropriate shuffle mask movdqu (%r12), %xmm2 # shift right 16-r13 bytes - PSHUFB_XMM %xmm2, %xmm1 + pshufb %xmm2, %xmm1 _data_read_\@: lea ALL_F+16(%rip), %r12 @@ -393,12 +392,12 @@ _data_read_\@: .ifc \operation, dec pand %xmm1, %xmm2 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10 ,%xmm2 + pshufb %xmm10 ,%xmm2 pxor %xmm2, %xmm8 .else movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10,%xmm0 + pshufb %xmm10,%xmm0 pxor %xmm0, %xmm8 .endif @@ -408,17 +407,17 @@ _data_read_\@: # GHASH computation for the last <16 byte block movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm0 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 .endif # Output %r13 bytes - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (%arg3 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_\@: mov %al, (%arg3, %r11, 1) @@ -449,7 +448,7 @@ _partial_done\@: movd %r12d, %xmm15 # len(A) in %xmm15 mov InLen(%arg2), %r12 shl $3, %r12 # len(C) in bits (*128) - MOVQ_R64_XMM %r12, %xmm1 + movq %r12, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) @@ -457,7 +456,7 @@ _partial_done\@: GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm8 + pshufb %xmm10, %xmm8 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) @@ -470,7 +469,7 @@ 
_return_T_\@: cmp $8, %r11 jl _T_4_\@ _T_8_\@: - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 @@ -518,9 +517,9 @@ _return_T_done_\@: pshufd $78, \HK, \TMP3 pxor \GH, \TMP2 # TMP2 = a1+a0 pxor \HK, \TMP3 # TMP3 = b1+b0 - PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 - PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) + pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \HK, \GH # GH = a0*b0 + pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) pxor \GH, \TMP2 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) movdqa \TMP2, \TMP3 @@ -570,7 +569,7 @@ _return_T_done_\@: cmp $8, \DLEN jl _read_lt8_\@ mov (\DPTR), %rax - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst sub $8, \DLEN jz _done_read_partial_block_\@ xor %eax, %eax @@ -579,7 +578,7 @@ _read_next_byte_\@: mov 7(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_\@ - MOVQ_R64_XMM %rax, \XMM1 + movq %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst jmp _done_read_partial_block_\@ @@ -590,7 +589,7 @@ _read_next_byte_lt8_\@: mov -1(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_lt8_\@ - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst _done_read_partial_block_\@: .endm @@ -608,7 +607,7 @@ _done_read_partial_block_\@: jl _get_AAD_rest\@ _get_AAD_blocks\@: movdqu (%r10), \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP7, \TMP6 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 add $16, %r10 @@ -624,7 +623,7 @@ _get_AAD_rest\@: je _get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP6, \TMP7 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 movdqu \TMP7, \TMP6 @@ -667,7 +666,7 @@ _data_read_\@: # Finished reading in data # r16-r13 is the number of bytes in plaintext mod 16) add %r13, %r12 movdqu (%r12), %xmm2 # get the appropriate shuffle mask - PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes + pshufb %xmm2, %xmm9 # shift right r13 bytes .ifc \operation, dec movdqa %xmm1, %xmm3 @@ -689,8 +688,8 @@ _no_extra_mask_1_\@: pand %xmm1, %xmm3 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm3 - PSHUFB_XMM %xmm2, %xmm3 + pshufb %xmm10, %xmm3 + pshufb %xmm2, %xmm3 pxor %xmm3, \AAD_HASH cmp $0, %r10 @@ -724,8 +723,8 @@ _no_extra_mask_2_\@: pand %xmm1, %xmm9 movdqa SHUF_MASK(%rip), %xmm1 - PSHUFB_XMM %xmm1, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm1, %xmm9 + pshufb %xmm2, %xmm9 pxor %xmm9, \AAD_HASH cmp $0, %r10 @@ -744,8 +743,8 @@ _encode_done_\@: movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm9 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm10, %xmm9 + pshufb %xmm2, %xmm9 .endif # output encrypted Bytes cmp $0, %r10 @@ -759,14 +758,14 @@ _partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 _count_set_\@: movdqa %xmm9, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) @@ -810,7 +809,7 @@ _partial_block_done_\@: .else MOVADQ \XMM0, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pshufb %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr lea 0x10(%arg1),%r10 @@ -821,7 +820,7 @@ _partial_block_done_\@: 
aes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq - AESENC \TMP1, %xmm\index + aesenc \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -829,7 +828,7 @@ aes_loop_initial_\@: MOVADQ (%r10), \TMP1 .irpc index, \i_seq - AESENCLAST \TMP1, %xmm\index # Last Round + aesenclast \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg4 , %r11, 1), \TMP1 @@ -841,7 +840,7 @@ aes_loop_initial_\@: .ifc \operation, dec movdqa \TMP1, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index + pshufb %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation .endr @@ -876,19 +875,19 @@ aes_loop_initial_\@: MOVADQ ONE(%RIP),\TMP1 paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM1 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM2 - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM3 - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM4 - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap MOVADQ 0(%arg1),\TMP1 pxor \TMP1, \XMM1 @@ -897,17 +896,17 @@ aes_loop_initial_\@: pxor \TMP1, \XMM4 .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr .irpc index, 56789 # do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr lea 0xa0(%arg1),%r10 mov keysize,%eax @@ -918,7 +917,7 @@ aes_loop_initial_\@: aes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 - AESENC \TMP2, %xmm\index + aesenc \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -926,10 +925,10 @@ aes_loop_pre_\@: aes_loop_pre_done\@: MOVADQ (%r10), \TMP2 - AESENCLAST \TMP2, \XMM1 - AESENCLAST \TMP2, \XMM2 - AESENCLAST \TMP2, \XMM3 - AESENCLAST \TMP2, \XMM4 + aesenclast \TMP2, \XMM1 + aesenclast \TMP2, \XMM2 + aesenclast \TMP2, \XMM3 + aesenclast \TMP2, \XMM4 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 .ifc \operation, dec @@ -961,12 +960,12 @@ aes_loop_pre_done\@: .endif add $64, %r11 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\@: @@ -978,7 +977,7 @@ _initial_blocks_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -994,7 +993,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = 
a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1002,51 +1001,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1058,25 +1057,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 
0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1089,13 +1088,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1105,7 +1104,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1113,12 +1112,12 @@ aes_loop_par_enc\@: aes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # Round 10 - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # Round 10 + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu 16(%arg4,%r11,1), \TMP3 @@ -1131,10 +1130,10 @@ aes_loop_par_enc_done\@: movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1186,7 +1185,7 @@ aes_loop_par_enc_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -1202,7 +1201,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1210,51 +1209,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # 
perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1266,25 +1265,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1297,13 +1296,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = 
a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1313,7 +1312,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1321,12 +1320,12 @@ aes_loop_par_dec\@: aes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # last round - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # last round + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer @@ -1343,10 +1342,10 @@ aes_loop_par_dec_done\@: pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1402,10 +1401,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 movdqu HashKey_4_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 @@ -1415,10 +1414,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 movdqu HashKey_3_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst pxor \TMP2, \XMM1 @@ -1430,10 +1429,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM3, \TMP2 pxor \XMM3, \TMP2 movdqu HashKey_2(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 movdqu HashKey_2_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 @@ -1443,10 +1442,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 
movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 movdqu HashKey_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst pxor \XMM1, \TMP2 @@ -1504,13 +1503,13 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst _esb_loop_\@: MOVADQ (%r10),\TMP1 - AESENC \TMP1,\XMM0 + aesenc \TMP1,\XMM0 add $16,%r10 sub $1,%eax jnz _esb_loop_\@ MOVADQ (%r10),\TMP1 - AESENCLAST \TMP1,\XMM0 + aesenclast \TMP1,\XMM0 .endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1849,72 +1848,72 @@ SYM_FUNC_START(aesni_set_key) movups 0x10(UKEYP), %xmm2 # other user key movaps %xmm2, (TKEYP) add $0x10, TKEYP - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_256a - AESKEYGENASSIST 0x1 %xmm0 %xmm1 + aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_256a - AESKEYGENASSIST 0x2 %xmm0 %xmm1 + aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_256a - AESKEYGENASSIST 0x4 %xmm0 %xmm1 + aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_256a - AESKEYGENASSIST 0x8 %xmm0 %xmm1 + aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_256a - AESKEYGENASSIST 0x10 %xmm0 %xmm1 + aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 call _key_expansion_256a - AESKEYGENASSIST 0x20 %xmm0 %xmm1 + aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(UKEYP), %xmm2 # other user key - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_192a - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_192b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_192a - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_192b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_192a - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 call _key_expansion_192b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_192a - AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 call _key_expansion_128 - AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 
2 + aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 call _key_expansion_128 - AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 call _key_expansion_128 - AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 call _key_expansion_128 - AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 call _key_expansion_128 - AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 call _key_expansion_128 - AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 call _key_expansion_128 - AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 call _key_expansion_128 - AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 + aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 call _key_expansion_128 - AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 + aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, TKEYP @@ -1927,7 +1926,7 @@ SYM_FUNC_START(aesni_set_key) .align 4 .Ldec_key_loop: movaps (KEYP), %xmm0 - AESIMC %xmm0 %xmm1 + aesimc %xmm0, %xmm1 movaps %xmm1, (UKEYP) add $0x10, KEYP sub $0x10, UKEYP @@ -1988,37 +1987,37 @@ SYM_FUNC_START_LOCAL(_aesni_enc1) je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps (TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE + aesenclast KEY, STATE ret SYM_FUNC_END(_aesni_enc1) @@ -2054,79 +2053,79 @@ SYM_FUNC_START_LOCAL(_aesni_enc4) je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 
(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE1 # last round - AESENCLAST KEY STATE2 - AESENCLAST KEY STATE3 - AESENCLAST KEY STATE4 + aesenclast KEY, STATE1 # last round + aesenclast KEY, STATE2 + aesenclast KEY, STATE3 + aesenclast KEY, STATE4 ret SYM_FUNC_END(_aesni_enc4) @@ -2178,37 +2177,37 @@ SYM_FUNC_START_LOCAL(_aesni_dec1) je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps (TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE + aesdeclast KEY, STATE ret SYM_FUNC_END(_aesni_dec1) @@ -2244,79 +2243,79 @@ SYM_FUNC_START_LOCAL(_aesni_dec4) je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - AESDEC KEY 
STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps (TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x20(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE1 # last round - AESDECLAST KEY STATE2 - AESDECLAST KEY STATE3 - AESDECLAST KEY STATE4 + aesdeclast KEY, STATE1 # last round + aesdeclast KEY, STATE2 + aesdeclast KEY, STATE3 + aesdeclast KEY, STATE4 ret SYM_FUNC_END(_aesni_dec4) @@ -2599,10 +2598,10 @@ SYM_FUNC_END(aesni_cbc_dec) SYM_FUNC_START_LOCAL(_aesni_inc_init) movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR - PSHUFB_XMM BSWAP_MASK CTR + pshufb BSWAP_MASK, CTR mov $1, TCTR_LOW - MOVQ_R64_XMM TCTR_LOW INC - MOVQ_R64_XMM CTR TCTR_LOW + movq TCTR_LOW, INC + movq CTR, TCTR_LOW ret SYM_FUNC_END(_aesni_inc_init) @@ -2630,7 +2629,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc) psrldq $8, INC .Linc_low: movaps CTR, IV - PSHUFB_XMM BSWAP_MASK IV + pshufb BSWAP_MASK, IV ret SYM_FUNC_END(_aesni_inc) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0cea33295287..5fee47956f3b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -120,7 +120,6 @@ ## #include <linux/linkage.h> -#include <asm/inst.h> # constants in mergeable sections, linker can reorder and merge .section .rodata.cst16.POLY, "aM", @progbits, 16 diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index a38ab2512a6f..ca1788bfee16 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -120,10 +120,10 @@ SYM_FUNC_START(chacha_block_xor_ssse3) FRAME_BEGIN # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 movdqa %xmm0,%xmm8 movdqa %xmm1,%xmm9 movdqa %xmm2,%xmm10 @@ -205,10 +205,10 @@ SYM_FUNC_START(hchacha_block_ssse3) # %edx: nrounds FRAME_BEGIN - movdqa 0x00(%rdi),%xmm0 - movdqa 
0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 mov %edx,%r8d call chacha_permute diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 22250091cdbe..e67a59130025 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -14,8 +14,6 @@ #include <linux/module.h> #include <asm/simd.h> -#define CHACHA_STATE_ALIGN 16 - asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, @@ -124,8 +122,6 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { hchacha_block_generic(state, stream, nrounds); } else { @@ -138,8 +134,6 @@ EXPORT_SYMBOL(hchacha_block_arch); void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - chacha_init_generic(state, key, iv); } EXPORT_SYMBOL(chacha_init_arch); @@ -147,8 +141,6 @@ EXPORT_SYMBOL(chacha_init_arch); void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); @@ -170,15 +162,12 @@ EXPORT_SYMBOL(chacha_crypt_arch); static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { @@ -217,12 +206,10 @@ static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct chacha_ctx subctx; u8 real_iv[16]; - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); chacha_init_generic(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 9fd28ff65bc2..6e7d4c4d3208 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -38,7 +38,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> .section .rodata @@ -129,17 +128,17 @@ loop_64:/* 64 bytes Full cache line folding */ #ifdef __x86_64__ movdqa %xmm4, %xmm8 #endif - PCLMULQDQ 00, CONSTANT, %xmm1 - PCLMULQDQ 00, CONSTANT, %xmm2 - PCLMULQDQ 00, CONSTANT, %xmm3 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm2 + pclmulqdq $0x00, CONSTANT, %xmm3 #ifdef __x86_64__ - PCLMULQDQ 00, CONSTANT, %xmm4 + pclmulqdq $0x00, CONSTANT, %xmm4 #endif - PCLMULQDQ 0x11, CONSTANT, %xmm5 - PCLMULQDQ 0x11, CONSTANT, %xmm6 - PCLMULQDQ 0x11, CONSTANT, %xmm7 + pclmulqdq $0x11, CONSTANT, %xmm5 + pclmulqdq $0x11, CONSTANT, %xmm6 + pclmulqdq $0x11, CONSTANT, 
%xmm7 #ifdef __x86_64__ - PCLMULQDQ 0x11, CONSTANT, %xmm8 + pclmulqdq $0x11, CONSTANT, %xmm8 #endif pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 @@ -149,8 +148,8 @@ loop_64:/* 64 bytes Full cache line folding */ #else /* xmm8 unsupported for x32 */ movdqa %xmm4, %xmm5 - PCLMULQDQ 00, CONSTANT, %xmm4 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm4 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm4 #endif @@ -172,20 +171,20 @@ less_64:/* Folding cache line into 128bit */ prefetchnta (BUF) movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm2, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm4, %xmm1 @@ -193,8 +192,8 @@ less_64:/* Folding cache line into 128bit */ jb fold_64 loop_16:/* Folding rest buffer into 128bit */ movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor (BUF), %xmm1 sub $0x10, LEN @@ -205,7 +204,7 @@ loop_16:/* Folding rest buffer into 128bit */ fold_64: /* perform the last 64 bit fold, also adds 32 zeroes * to the input stream */ - PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ psrldq $0x08, %xmm1 pxor CONSTANT, %xmm1 @@ -220,7 +219,7 @@ fold_64: #endif psrldq $0x04, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 pxor %xmm2, %xmm1 /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ @@ -231,11 +230,11 @@ fold_64: #endif movdqa %xmm1, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x10, CONSTANT, %xmm1 + pclmulqdq $0x10, CONSTANT, %xmm1 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 pxor %xmm2, %xmm1 - PEXTRD 0x01, %xmm1, %eax + pextrd $0x01, %xmm1, %eax ret SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 8501ec4532f4..884dc767b051 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -43,7 +43,6 @@ * SOFTWARE. 
*/ -#include <asm/inst.h> #include <linux/linkage.h> #include <asm/nospec-branch.h> @@ -170,7 +169,7 @@ continue_block: ## branch into array lea jump_table(%rip), %bufp - movzxw (%bufp, %rax, 2), len + movzwq (%bufp, %rax, 2), len lea crc_array(%rip), %bufp lea (%bufp, len, 1), %bufp JMP_NOSPEC bufp @@ -225,10 +224,10 @@ LABEL crc_ %i subq %rax, tmp # tmp -= rax*24 movq crc_init, %xmm1 # CRC for block 1 - PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 - PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 pxor %xmm2,%xmm1 movq %xmm1, %rax diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 8a17621f7d3a..8acbb6584a37 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -948,10 +948,8 @@ static void store_felem(u64 *b, u64 *f) { u64 f30 = f[3U]; u64 top_bit0 = f30 >> (u32)63U; - u64 carry0; u64 f31; u64 top_bit; - u64 carry; u64 f0; u64 f1; u64 f2; @@ -970,11 +968,11 @@ static void store_felem(u64 *b, u64 *f) u64 o2; u64 o3; f[3U] = f30 & (u64)0x7fffffffffffffffU; - carry0 = add_scalar(f, f, (u64)19U * top_bit0); + add_scalar(f, f, (u64)19U * top_bit0); f31 = f[3U]; top_bit = f31 >> (u32)63U; f[3U] = f31 & (u64)0x7fffffffffffffffU; - carry = add_scalar(f, f, (u64)19U * top_bit); + add_scalar(f, f, (u64)19U * top_bit); f0 = f[0U]; f1 = f[1U]; f2 = f[2U]; diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index bb9735fbb865..99ac25e18e09 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -14,7 +14,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> .section .rodata.cst16.bswap_mask, "aM", @progbits, 16 @@ -51,9 +50,9 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) pxor DATA, T2 pxor SHASH, T3 - PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 - PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 - PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) + pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 + pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 + pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) pxor DATA, T2 pxor T1, T2 # T2 = a0 * b1 + a1 * b0 @@ -95,9 +94,9 @@ SYM_FUNC_START(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA call __clmul_gf128mul_ble - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) FRAME_END ret @@ -114,18 +113,18 @@ SYM_FUNC_START(clmul_ghash_update) movaps .Lbswap_mask, BSWAP movups (%rdi), DATA movups (%rcx), SHASH - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA .align 4 .Lupdate_loop: movups (%rsi), IN1 - PSHUFB_XMM BSWAP IN1 + pshufb BSWAP, IN1 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx add $16, %rsi cmp $16, %rdx jge .Lupdate_loop - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: FRAME_END diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..438ccd4f3cc4 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -12,7 +12,6 @@ #define REG_TYPE_R32 0 #define REG_TYPE_R64 1 -#define REG_TYPE_XMM 2 #define REG_TYPE_INVALID 100 .macro R32_NUM opd r32 @@ -123,77 +122,18 @@ #endif .endm - .macro XMM_NUM opd xmm - \opd = REG_NUM_INVALID - .ifc \xmm,%xmm0 - \opd = 0 - .endif - .ifc \xmm,%xmm1 - \opd = 1 - .endif - .ifc \xmm,%xmm2 - \opd = 2 - .endif - .ifc \xmm,%xmm3 - \opd = 3 - 
.endif - .ifc \xmm,%xmm4 - \opd = 4 - .endif - .ifc \xmm,%xmm5 - \opd = 5 - .endif - .ifc \xmm,%xmm6 - \opd = 6 - .endif - .ifc \xmm,%xmm7 - \opd = 7 - .endif - .ifc \xmm,%xmm8 - \opd = 8 - .endif - .ifc \xmm,%xmm9 - \opd = 9 - .endif - .ifc \xmm,%xmm10 - \opd = 10 - .endif - .ifc \xmm,%xmm11 - \opd = 11 - .endif - .ifc \xmm,%xmm12 - \opd = 12 - .endif - .ifc \xmm,%xmm13 - \opd = 13 - .endif - .ifc \xmm,%xmm14 - \opd = 14 - .endif - .ifc \xmm,%xmm15 - \opd = 15 - .endif - .endm - .macro REG_TYPE type reg R32_NUM reg_type_r32 \reg R64_NUM reg_type_r64 \reg - XMM_NUM reg_type_xmm \reg .if reg_type_r64 <> REG_NUM_INVALID \type = REG_TYPE_R64 .elseif reg_type_r32 <> REG_NUM_INVALID \type = REG_TYPE_R32 - .elseif reg_type_xmm <> REG_NUM_INVALID - \type = REG_TYPE_XMM .else \type = REG_TYPE_INVALID .endif .endm - .macro PFX_OPD_SIZE - .byte 0x66 - .endm - .macro PFX_REX opd1 opd2 W=0 .if ((\opd1 | \opd2) & 8) || \W .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) @@ -203,109 +143,6 @@ .macro MODRM mod opd1 opd2 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) .endm - - .macro PSHUFB_XMM xmm1 xmm2 - XMM_NUM pshufb_opd1 \xmm1 - XMM_NUM pshufb_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX pshufb_opd1 pshufb_opd2 - .byte 0x0f, 0x38, 0x00 - MODRM 0xc0 pshufb_opd1 pshufb_opd2 - .endm - - .macro PCLMULQDQ imm8 xmm1 xmm2 - XMM_NUM clmul_opd1 \xmm1 - XMM_NUM clmul_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX clmul_opd1 clmul_opd2 - .byte 0x0f, 0x3a, 0x44 - MODRM 0xc0 clmul_opd1 clmul_opd2 - .byte \imm8 - .endm - - .macro PEXTRD imm8 xmm gpr - R32_NUM extrd_opd1 \gpr - XMM_NUM extrd_opd2 \xmm - PFX_OPD_SIZE - PFX_REX extrd_opd1 extrd_opd2 - .byte 0x0f, 0x3a, 0x16 - MODRM 0xc0 extrd_opd1 extrd_opd2 - .byte \imm8 - .endm - - .macro AESKEYGENASSIST rcon xmm1 xmm2 - XMM_NUM aeskeygen_opd1 \xmm1 - XMM_NUM aeskeygen_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aeskeygen_opd1 aeskeygen_opd2 - .byte 0x0f, 0x3a, 0xdf - MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 - .byte \rcon - .endm - - .macro AESIMC xmm1 xmm2 - XMM_NUM aesimc_opd1 \xmm1 - XMM_NUM aesimc_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesimc_opd1 aesimc_opd2 - .byte 0x0f, 0x38, 0xdb - MODRM 0xc0 aesimc_opd1 aesimc_opd2 - .endm - - .macro AESENC xmm1 xmm2 - XMM_NUM aesenc_opd1 \xmm1 - XMM_NUM aesenc_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesenc_opd1 aesenc_opd2 - .byte 0x0f, 0x38, 0xdc - MODRM 0xc0 aesenc_opd1 aesenc_opd2 - .endm - - .macro AESENCLAST xmm1 xmm2 - XMM_NUM aesenclast_opd1 \xmm1 - XMM_NUM aesenclast_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesenclast_opd1 aesenclast_opd2 - .byte 0x0f, 0x38, 0xdd - MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 - .endm - - .macro AESDEC xmm1 xmm2 - XMM_NUM aesdec_opd1 \xmm1 - XMM_NUM aesdec_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesdec_opd1 aesdec_opd2 - .byte 0x0f, 0x38, 0xde - MODRM 0xc0 aesdec_opd1 aesdec_opd2 - .endm - - .macro AESDECLAST xmm1 xmm2 - XMM_NUM aesdeclast_opd1 \xmm1 - XMM_NUM aesdeclast_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesdeclast_opd1 aesdeclast_opd2 - .byte 0x0f, 0x38, 0xdf - MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 - .endm - - .macro MOVQ_R64_XMM opd1 opd2 - REG_TYPE movq_r64_xmm_opd1_type \opd1 - .if movq_r64_xmm_opd1_type == REG_TYPE_XMM - XMM_NUM movq_r64_xmm_opd1 \opd1 - R64_NUM movq_r64_xmm_opd2 \opd2 - .else - R64_NUM movq_r64_xmm_opd1 \opd1 - XMM_NUM movq_r64_xmm_opd2 \opd2 - .endif - PFX_OPD_SIZE - PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 - .if movq_r64_xmm_opd1_type == REG_TYPE_XMM - .byte 0x0f, 0x7e - .else - .byte 0x0f, 0x6e - .endif - MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 - .endm #endif #endif |
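The opcode-emitting helpers dropped from asm/inst.h above existed so that assemblers lacking the AES-NI/PCLMUL mnemonics could still build these files from raw bytes; the plain mnemonics used throughout the patch assemble to the same machine code. A minimal sketch of that equivalence for one pclmulqdq form (illustration only, not part of the patch; the register choice is arbitrary):

	# native mnemonic, assembled directly by a PCLMUL-aware binutils
	pclmulqdq	$0x11, %xmm5, %xmm1	# carry-less multiply of the high 64-bit halves

	# byte-for-byte what the removed PCLMULQDQ macro emitted for the same operands:
	# 0x66 prefix, 0f 3a 44 opcode, ModRM = 0xc0 | (dst << 3) | src, then imm8
	.byte 0x66, 0x0f, 0x3a, 0x44, 0xcd, 0x11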