summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-19 18:52:58 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-19 18:52:58 +0300
commitc434e25b62f8efcfbb6bf1f7ce55960206c1137e (patch)
tree824a68893982c718225a1821fda98c495178473d /arch
parent720261cfc7329406a50c2a8536e0039b9dd9a4e5 (diff)
parentdf1e9791998a92fe9f1e7d3f031b34daaad39e2f (diff)
downloadlinux-c434e25b62f8efcfbb6bf1f7ce55960206c1137e.tar.xz
Merge tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu: "API: - Test setkey in no-SIMD context - Add skcipher speed test for user-specified algorithm Algorithms: - Add x25519 support on ppc64le - Add VAES and AVX512 / AVX10 optimized AES-GCM on x86 - Remove sm2 algorithm Drivers: - Add Allwinner H616 support to sun8i-ce - Use DMA in stm32 - Add Exynos850 hwrng support to exynos" * tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (81 commits) hwrng: core - remove (un)register_miscdev() crypto: lib/mpi - delete unnecessary condition crypto: testmgr - generate power-of-2 lengths more often crypto: mxs-dcp - Ensure payload is zero when using key slot hwrng: Kconfig - Do not enable by default CN10K driver crypto: starfive - Fix nent assignment in rsa dec crypto: starfive - Align rsa input data to 32-bit crypto: qat - fix unintentional re-enabling of error interrupts crypto: qat - extend scope of lock in adf_cfg_add_key_value_param() Documentation: qat: fix auto_reset attribute details crypto: sun8i-ce - add Allwinner H616 support crypto: sun8i-ce - wrap accesses to descriptor address fields dt-bindings: crypto: sun8i-ce: Add compatible for H616 hwrng: core - Fix wrong quality calculation at hw rng registration hwrng: exynos - Enable Exynos850 support hwrng: exynos - Add SMC based TRNG operation hwrng: exynos - Implement bus clock control hwrng: exynos - Use devm_clk_get_enabled() to get the clock hwrng: exynos - Improve coding style dt-bindings: rng: Add Exynos850 support to exynos-trng ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/crypto/aes-neonbs-glue.c1
-rw-r--r--arch/arm/crypto/crc32-ce-core.S17
-rw-r--r--arch/arm/crypto/crc32-ce-glue.c1
-rw-r--r--arch/arm/crypto/crct10dif-ce-glue.c1
-rw-r--r--arch/arm/crypto/curve25519-glue.c1
-rw-r--r--arch/arm/crypto/poly1305-glue.c1
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c1
-rw-r--r--arch/arm64/crypto/crct10dif-ce-glue.c3
-rw-r--r--arch/arm64/crypto/poly1305-glue.c1
-rw-r--r--arch/powerpc/crypto/Kconfig11
-rw-r--r--arch/powerpc/crypto/Makefile2
-rw-r--r--arch/powerpc/crypto/curve25519-ppc64le-core.c299
-rw-r--r--arch/powerpc/crypto/curve25519-ppc64le_asm.S671
-rw-r--r--arch/x86/crypto/Kconfig1
-rw-r--r--arch/x86/crypto/Makefile8
-rw-r--r--arch/x86/crypto/aes-gcm-aesni-x86_64.S1128
-rw-r--r--arch/x86/crypto/aes-gcm-avx10-x86_64.S1222
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1503
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2804
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c1269
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c1
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c1
-rw-r--r--arch/x86/crypto/poly1305_glue.c4
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c9
24 files changed, 4135 insertions, 4825 deletions
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index f00f042ef357..201eb35dde37 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("Bit sliced AES using NEON instructions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("ecb(aes)");
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
index 3f13a76b9066..88f9edf94e95 100644
--- a/arch/arm/crypto/crc32-ce-core.S
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -48,6 +48,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
.text
@@ -123,11 +124,12 @@
* uint crc32_pmull_le(unsigned char const *buffer,
* size_t len, uint crc32)
*/
-ENTRY(crc32_pmull_le)
+SYM_FUNC_START(crc32_pmull_le)
adr r3, .Lcrc32_constants
b 0f
+SYM_FUNC_END(crc32_pmull_le)
-ENTRY(crc32c_pmull_le)
+SYM_FUNC_START(crc32c_pmull_le)
adr r3, .Lcrc32c_constants
0: bic LEN, LEN, #15
@@ -236,8 +238,7 @@ fold_64:
vmov r0, s5
bx lr
-ENDPROC(crc32_pmull_le)
-ENDPROC(crc32c_pmull_le)
+SYM_FUNC_END(crc32c_pmull_le)
.macro __crc32, c
subs ip, r2, #8
@@ -296,11 +297,11 @@ ARM_BE8(rev16 r3, r3 )
.endm
.align 5
-ENTRY(crc32_armv8_le)
+SYM_TYPED_FUNC_START(crc32_armv8_le)
__crc32
-ENDPROC(crc32_armv8_le)
+SYM_FUNC_END(crc32_armv8_le)
.align 5
-ENTRY(crc32c_armv8_le)
+SYM_TYPED_FUNC_START(crc32c_armv8_le)
__crc32 c
-ENDPROC(crc32c_armv8_le)
+SYM_FUNC_END(crc32c_armv8_le)
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 2208445808d7..4ff18044af07 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -241,6 +241,7 @@ module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crc32");
MODULE_ALIAS_CRYPTO("crc32c");
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index e9191a8c87b9..79f3b204d8c0 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -84,5 +84,6 @@ module_init(crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crct10dif");
diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c
index 9bdafd57888c..e7b87e09dd99 100644
--- a/arch/arm/crypto/curve25519-glue.c
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -133,4 +133,5 @@ module_exit(arm_curve25519_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
+MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)");
MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c
index c31bd8f7c092..8482e302c45a 100644
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -267,6 +267,7 @@ static void __exit arm_poly1305_mod_exit(void)
module_init(arm_poly1305_mod_init);
module_exit(arm_poly1305_mod_exit);
+MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-arm");
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index 467ac2f768ac..46425e7b9755 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("Bit sliced AES using NEON instructions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("ecb(aes)");
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 09eb1456aed4..606d25c559ed 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -98,7 +98,7 @@ static struct shash_alg crc_t10dif_alg[] = {{
.base.cra_name = "crct10dif",
.base.cra_driver_name = "crct10dif-arm64-neon",
- .base.cra_priority = 100,
+ .base.cra_priority = 150,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -138,6 +138,7 @@ module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
index 1fae18ba11ed..9c4bfd62e789 100644
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -226,6 +226,7 @@ static void __exit neon_poly1305_mod_exit(void)
module_init(neon_poly1305_mod_init);
module_exit(neon_poly1305_mod_exit);
+MODULE_DESCRIPTION("Poly1305 transform using NEON instructions");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-neon");
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..09ebcbdfb34f 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,6 +2,17 @@
menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
+config CRYPTO_CURVE25519_PPC64
+ tristate "Public key crypto: Curve25519 (PowerPC64)"
+ depends on PPC64 && CPU_LITTLE_ENDIAN
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ help
+ Curve25519 algorithm
+
+ Architecture: PowerPC64
+ - Little-endian
+
config CRYPTO_CRC32C_VPMSUM
tristate "CRC32c"
depends on PPC64 && ALTIVEC
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..59808592f0a1 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
md5-ppc-y := md5-asm.o md5-glue.o
@@ -29,6 +30,7 @@ aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-p
chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
override flavour := linux-ppc64le
diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index 000000000000..4e3e44ea4484
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ * Based on RFC7748 and AArch64 optimized implementation for X25519
+ * - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include <crypto/curve25519.h>
+#include <crypto/internal/kpp.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = f[0] + g[0];
+ h[1] = f[1] + g[1];
+ h[2] = f[2] + g[2];
+ h[3] = f[3] + g[3];
+ h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+ h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+ h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+ h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+ h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+ /*
+ * Make sure 64-bit aligned.
+ */
+ unsigned char sbuf[32+8];
+ unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+ memcpy(sb, s, 32);
+ x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+ fe51 a0, b, c, t00;
+
+ fsqr(a0, i);
+ x25519_fe51_sqr_times(t00, a0, 2);
+
+ fmul(b, t00, i);
+ fmul(a0, b, a0);
+
+ fsqr(t00, a0);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 5);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 10);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 20);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 10);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 50);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 100);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 50);
+
+ fmul(t00, t00, b);
+ x25519_fe51_sqr_times(t00, t00, 5);
+
+ fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+ const uint8_t point[32])
+{
+ fe51 x1, x2, z2, x3, z3;
+ uint8_t s[32];
+ unsigned int swap = 0;
+ int i;
+
+ memcpy(s, scalar, 32);
+ s[0] &= 0xf8;
+ s[31] &= 0x7f;
+ s[31] |= 0x40;
+ fe51_frombytes(x1, point);
+
+ z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+ x3[0] = x1[0];
+ x3[1] = x1[1];
+ x3[2] = x1[2];
+ x3[3] = x1[3];
+ x3[4] = x1[4];
+
+ x2[0] = z3[0] = 1;
+ x2[1] = z3[1] = 0;
+ x2[2] = z3[2] = 0;
+ x2[3] = z3[3] = 0;
+ x2[4] = z3[4] = 0;
+
+ for (i = 254; i >= 0; --i) {
+ unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+ fe51 a, b, c, d, e;
+ fe51 da, cb, aa, bb;
+ fe51 dacb_p, dacb_m;
+
+ swap ^= k_t;
+ x25519_cswap(x2, x3, swap);
+ x25519_cswap(z2, z3, swap);
+ swap = k_t;
+
+ fsub(b, x2, z2); // B = x_2 - z_2
+ fadd(a, x2, z2); // A = x_2 + z_2
+ fsub(d, x3, z3); // D = x_3 - z_3
+ fadd(c, x3, z3); // C = x_3 + z_3
+
+ fsqr(bb, b); // BB = B^2
+ fsqr(aa, a); // AA = A^2
+ fmul(da, d, a); // DA = D * A
+ fmul(cb, c, b); // CB = C * B
+
+ fsub(e, aa, bb); // E = AA - BB
+ fmul(x2, aa, bb); // x2 = AA * BB
+ fadd(dacb_p, da, cb); // DA + CB
+ fsub(dacb_m, da, cb); // DA - CB
+
+ fmul121666(z3, e); // 121666 * E
+ fsqr(z2, dacb_m); // (DA - CB)^2
+ fsqr(x3, dacb_p); // x3 = (DA + CB)^2
+ fadd(b, bb, z3); // BB + 121666 * E
+ fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2
+ fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2)
+ }
+
+ finv(z2, z2);
+ fmul(x2, x2, z2);
+ fe51_tobytes(out, x2);
+}
+
+void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(mypublic, secret, basepoint);
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(pub, secret, curve25519_base_point);
+}
+EXPORT_SYMBOL(curve25519_base_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+ unsigned int len)
+{
+ u8 *secret = kpp_tfm_ctx(tfm);
+
+ if (!len)
+ curve25519_generate_secret(secret);
+ else if (len == CURVE25519_KEY_SIZE &&
+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_generate_public_key(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+
+ if (req->src)
+ return -EINVAL;
+
+ curve25519_base_arch(buf, secret);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_compute_shared_secret(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 public_key[CURVE25519_KEY_SIZE];
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+
+ if (!req->src)
+ return -EINVAL;
+
+ copied = sg_copy_to_buffer(req->src,
+ sg_nents_for_len(req->src,
+ CURVE25519_KEY_SIZE),
+ public_key, CURVE25519_KEY_SIZE);
+ if (copied != CURVE25519_KEY_SIZE)
+ return -EINVAL;
+
+ curve25519_arch(buf, secret, public_key);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+ return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+ .base.cra_name = "curve25519",
+ .base.cra_driver_name = "curve25519-ppc64le",
+ .base.cra_priority = 200,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
+
+ .set_secret = curve25519_set_secret,
+ .generate_public_key = curve25519_generate_public_key,
+ .compute_shared_secret = curve25519_compute_shared_secret,
+ .max_size = curve25519_max_size,
+};
+
+
+static int __init curve25519_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+ crypto_register_kpp(&curve25519_alg) : 0;
+}
+
+static void __exit curve25519_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_KPP))
+ crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(curve25519_mod_init);
+module_exit(curve25519_mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-ppc64le");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+# and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 6,0(5)
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+
+ mulld 24,8,6
+ mulhdu 25,8,6
+
+ mulld 30,11,6
+ mulhdu 31,11,6
+ ld 4,8(5)
+ mulli 11,11,19
+
+ mulld 26,9,6
+ mulhdu 27,9,6
+
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ ld 6,16(5)
+ mulli 10,10,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ ld 4,24(5)
+ mulli 9,9,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ ld 6,32(5)
+ mulli 8,8,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 28,28,12
+ adde 29,29,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 30,30,12
+ adde 31,31,21
+
+.Lfe51_reduce:
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ lis 6,1
+ ori 6,6,56130
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mtctr 5
+
+.Lsqr_times_loop:
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ # fe51_reduce
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ bdnz .Lsqr_times_loop
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+
+ srdi 10, 5, 51
+ and 5, 5, 12 # h0
+
+ sldi 11, 6, 13
+ or 11, 10, 11 # h1t
+ srdi 10, 6, 38
+ and 6, 11, 12 # h1
+
+ sldi 11, 7, 26
+ or 10, 10, 11 # h2t
+
+ srdi 11, 7, 25
+ and 7, 10, 12 # h2
+ sldi 10, 8, 39
+ or 11, 11, 10 # h3t
+
+ srdi 9, 8, 12
+ and 8, 11, 12 # h3
+ and 9, 9, 12 # h4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+ std 9, 32(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+ ld 9, 32(4)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ # Full reducuction
+ addi 10, 5, 19
+ srdi 10, 10, 51
+ add 10, 10, 6
+ srdi 10, 10, 51
+ add 10, 10, 7
+ srdi 10, 10, 51
+ add 10, 10, 8
+ srdi 10, 10, 51
+ add 10, 10, 9
+ srdi 10, 10, 51
+
+ mulli 10, 10, 19
+ add 5, 5, 10
+ srdi 11, 5, 51
+ add 6, 6, 11
+ srdi 11, 6, 51
+ add 7, 7, 11
+ srdi 11, 7, 51
+ add 8, 8, 11
+ srdi 11, 8, 51
+ add 9, 9, 11
+
+ and 5, 5, 12
+ and 6, 6, 12
+ and 7, 7, 12
+ and 8, 8, 12
+ and 9, 9, 12
+
+ sldi 10, 6, 51
+ or 5, 5, 10 # s0
+
+ srdi 11, 6, 13
+ sldi 10, 7, 38
+ or 6, 11, 10 # s1
+
+ srdi 11, 7, 26
+ sldi 10, 8, 25
+ or 7, 11, 10 # s2
+
+ srdi 11, 8, 39
+ sldi 10, 9, 12
+ or 8, 11, 10 # s4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align 5
+SYM_FUNC_START(x25519_cswap)
+
+ li 7, 5
+ neg 6, 5
+ mtctr 7
+
+.Lswap_loop:
+ ld 8, 0(3)
+ ld 9, 0(4)
+ xor 10, 8, 9
+ and 10, 10, 6
+ xor 11, 8, 10
+ xor 12, 9, 10
+ std 11, 0(3)
+ addi 3, 3, 8
+ std 12, 0(4)
+ addi 4, 4, 8
+ bdnz .Lswap_loop
+
+ blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index c9e59589a1ce..24875e6295f2 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -18,6 +18,7 @@ config CRYPTO_AES_NI_INTEL
depends on X86
select CRYPTO_AEAD
select CRYPTO_LIB_AES
+ select CRYPTO_LIB_GF128MUL
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9c5ce5613738..53b4a277809e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,8 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
- aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \
+ aes-gcm-aesni-x86_64.o \
+ aes-xts-avx-x86_64.o
+ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
+aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
+endif
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
new file mode 100644
index 000000000000..45940e2883a0
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -0,0 +1,1128 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-NI optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support the original set of AES instructions, i.e. AES-NI. Two
+// implementations are provided, one that uses AVX and one that doesn't. They
+// are very similar, being generated by the same macros. The only difference is
+// that the AVX implementation takes advantage of VEX-coded instructions in some
+// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
+// implementation does *not* use 256-bit vectors, as AES is not supported on
+// 256-bit vectors until the VAES feature (which this file doesn't target).
+//
+// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
+// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
+// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
+//
+// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// more thoroughly commented. This file has the following notable changes:
+//
+// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
+// there is only one AES block (and GHASH block) per register.
+//
+// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
+// 32. We work around this by being much more careful about using
+// registers, relying heavily on loads to load values as they are needed.
+//
+// - Masking is not available either. We work around this by implementing
+// partial block loads and stores using overlapping scalar loads and stores
+// combined with shifts and SSE4.1 insertion and extraction instructions.
+//
+// - The main loop is organized differently due to the different design
+// constraints. First, with just one AES block per SIMD register, on some
+// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
+// do an 8-register wide loop. Considering that and the fact that we have
+// just 16 SIMD registers to work with, it's not feasible to cache AES
+// round keys and GHASH key powers in registers across loop iterations.
+// That's not ideal, but also not actually that bad, since loads can run in
+// parallel with other instructions. Significantly, this also makes it
+// possible to roll up the inner loops, relying on hardware loop unrolling
+// instead of software loop unrolling, greatly reducing code size.
+//
+// - We implement the GHASH multiplications in the main loop using Karatsuba
+// multiplication instead of schoolbook multiplication. This saves one
+// pclmulqdq instruction per block, at the cost of one 64-bit load, one
+// pshufd, and 0.25 pxors per block. (This is without the three-argument
+// XOR support that would be provided by AVX512 / AVX10, which would be
+// more beneficial to schoolbook than Karatsuba.)
+//
+// As a rough approximation, we can assume that Karatsuba multiplication is
+// faster than schoolbook multiplication in this context if one pshufd and
+// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
+// load is "free" due to running in parallel with arithmetic instructions.)
+// This is true on AMD CPUs, including all that support pclmulqdq up to at
+// least Zen 3. It's also true on older Intel CPUs: Westmere through
+// Haswell on the Core side, and Silvermont through Goldmont Plus on the
+// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
+// benefit of Karatsuba should be substantial. On newer Intel CPUs,
+// schoolbook multiplication should be faster, but only marginally.
+//
+// Not all these CPUs were available to be tested. However, benchmarks on
+// available CPUs suggest that this approximation is plausible. Switching
+// to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
+// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
+// Considering that and the fact that Karatsuba should be even more
+// beneficial on older Intel CPUs, it seems like the right choice here.
+//
+// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
+// saved by using a multiplication-less reduction method. We don't do that
+// because it would require a large number of shift and xor instructions,
+// making it less worthwhile and likely harmful on newer CPUs.
+//
+// It does make sense to sometimes use a different reduction optimization
+// that saves a pclmulqdq, though: precompute the hash key times x^64, and
+// multiply the low half of the data block by the hash key with the extra
+// factor of x^64. This eliminates one step of the reduction. However,
+// this is incompatible with Karatsuba multiplication. Therefore, for
+// multi-block processing we use Karatsuba multiplication with a regular
+// reduction. For single-block processing, we use the x^64 optimization.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lgfpoly:
+ .quad 0xc200000000000000
+.Lone:
+ .quad 1
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+ // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
+ // 'len' 0xff bytes and the rest zeroes.
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
+
+// Offsets in struct aes_gcm_key_aesni
+#define OFFSETOF_AESKEYLEN 480
+#define OFFSETOF_H_POWERS 496
+#define OFFSETOF_H_POWERS_XORED 624
+#define OFFSETOF_H_TIMES_X64 688
+
+.text
+
+// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
+// assumes that all operands are distinct and that any mem operand is aligned.
+.macro _vpclmulqdq imm, src1, src2, dst
+.if USE_AVX
+ vpclmulqdq \imm, \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pclmulqdq \imm, \src1, \dst
+.endif
+.endm
+
+// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
+// that all operands are distinct and that any mem operand is aligned.
+.macro _vpshufb src1, src2, dst
+.if USE_AVX
+ vpshufb \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pshufb \src1, \dst
+.endif
+.endm
+
+// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
+// all operands are distinct.
+.macro _vpand src1, src2, dst
+.if USE_AVX
+ vpand \src1, \src2, \dst
+.else
+ movdqu \src1, \dst
+ pand \src2, \dst
+.endif
+.endm
+
+// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
+// be a temporary xmm register.
+.macro _xor_mem_to_reg mem, reg, tmp
+.if USE_AVX
+ vpxor \mem, \reg, \reg
+.else
+ movdqu \mem, \tmp
+ pxor \tmp, \reg
+.endif
+.endm
+
+// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
+// must be a temporary xmm register.
+.macro _test_mem mem, reg, tmp
+.if USE_AVX
+ vptest \mem, \reg
+.else
+ movdqu \mem, \tmp
+ ptest \tmp, \reg
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 15 bytes.
+ movq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ pinsrq $1, %rax, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ movq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and %rsi.
+.macro _store_partial_block src, dst
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 15 bytes.
+ pextrq $1, \src, %rax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
+ movq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ pextrd $1, \src, %eax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
+ movd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ pextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ pextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ pextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
+// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
+// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
+// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
+
+ // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
+.if \i == 0
+ _vpclmulqdq $0x01, \a, \b, \t0
+.elseif \i == 1
+ _vpclmulqdq $0x00, \a_times_x64, \b, \t1
+.elseif \i == 2
+ pxor \t1, \t0
+
+ // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
+.elseif \i == 3
+ _vpclmulqdq $0x11, \a, \b, \t1
+.elseif \i == 4
+ pclmulqdq $0x10, \a_times_x64, \b
+.elseif \i == 5
+ pxor \t1, \b
+.elseif \i == 6
+
+ // Fold MI into HI.
+ pshufd $0x4e, \t0, \t1 // Swap halves of MI
+.elseif \i == 7
+ pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ pxor \t1, \b
+.elseif \i == 9
+ pxor \t0, \b
+.endif
+.endm
+
+// GHASH-multiply \a by \b and store the reduced product in \b.
+// See _ghash_mul_step for details.
+.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
+.endr
+.endm
+
+// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
+// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
+// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
+// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
+.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, \a, \b, \t0
+ pxor \t0, \lo
+
+ // b_L + b_H
+ pshufd $0x4e, \b, \t0
+ pxor \b, \t0
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, \a, \b
+ pxor \b, \hi
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, \a_xored, \t0
+ pxor \t0, \mi
+.endm
+
+// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
+// This assumes that _ghash_mul_noreduce was used.
+.macro _ghash_reduce lo, mi, hi, dst, t0
+
+ movq .Lgfpoly(%rip), \t0
+
+ // MI += LO + HI (needed because we used Karatsuba multiplication)
+ pxor \lo, \mi
+ pxor \hi, \mi
+
+ // Fold LO into MI.
+ pshufd $0x4e, \lo, \dst
+ pclmulqdq $0x00, \t0, \lo
+ pxor \dst, \mi
+ pxor \lo, \mi
+
+ // Fold MI into HI.
+ pshufd $0x4e, \mi, \dst
+ pclmulqdq $0x00, \t0, \mi
+ pxor \hi, \dst
+ pxor \mi, \dst
+.endm
+
+// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
+//
+// The whole GHASH update does:
+//
+// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
+// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
+//
+// This macro just does the first step: it does the unreduced multiplication
+// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
+// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
+// inner block counter in %rax, which is a value that counts up by 8 for each
+// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
+//
+// To reduce the number of pclmulqdq instructions required, both this macro and
+// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
+// multiplication. See the file comment for more details about this choice.
+//
+// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
+// encrypting, or SRC if decrypting. They also expect the precomputed hash key
+// powers H^i and their XOR'd-together halves to be available in the struct
+// pointed to by KEY. Both macros clobber TMP[0-2].
+.macro _ghash_update_begin_8x enc
+
+ // Initialize the inner block counter.
+ xor %eax, %eax
+
+ // Load the highest hash key power, H^8.
+ movdqa OFFSETOF_H_POWERS(KEY), TMP0
+
+ // Load the first ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST), TMP1
+.else
+ movdqu (SRC), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // Add the GHASH accumulator to the ciphertext block to get the block
+ // 'b' that needs to be multiplied with the hash key power 'a'.
+ pxor TMP1, GHASH_ACC
+
+ // b_L + b_H
+ pshufd $0x4e, GHASH_ACC, MI
+ pxor GHASH_ACC, MI
+
+ // LO = a_L * b_L
+ _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO
+
+ // HI = a_H * b_H
+ pclmulqdq $0x11, TMP0, GHASH_ACC
+
+ // MI = (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
+.endm
+
+// Continue the GHASH update of 8 ciphertext blocks as described above by doing
+// an unreduced multiplication of the next ciphertext block by the next lowest
+// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
+.macro _ghash_update_continue_8x enc
+ add $8, %eax
+
+ // Load the next lowest key power.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
+
+ // Load the next ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST,%rax,2), TMP1
+.else
+ movdqu (SRC,%rax,2), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, TMP0, TMP1, TMP2
+ pxor TMP2, LO
+
+ // b_L + b_H
+ pshufd $0x4e, TMP1, TMP2
+ pxor TMP1, TMP2
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, TMP0, TMP1
+ pxor TMP1, GHASH_ACC
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
+ pclmulqdq $0x00, TMP1, TMP2
+ pxor TMP2, MI
+.endm
+
+// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
+// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
+// it uses the same register for HI and the destination. It's also divided into
+// two steps. TMP1 must be preserved across steps.
+//
+// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
+// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
+// increase the critical path length, and it seems to slightly hurt performance.
+.macro _ghash_update_end_8x_step i
+.if \i == 0
+ movq .Lgfpoly(%rip), TMP1
+ pxor LO, MI
+ pxor GHASH_ACC, MI
+ pshufd $0x4e, LO, TMP2
+ pclmulqdq $0x00, TMP1, LO
+ pxor TMP2, MI
+ pxor LO, MI
+.elseif \i == 1
+ pshufd $0x4e, MI, TMP2
+ pclmulqdq $0x00, TMP1, MI
+ pxor TMP2, GHASH_ACC
+ pxor MI, GHASH_ACC
+.endif
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
+//
+// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
+// related fields in the key struct.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables.
+ // %xmm0-%xmm1 and %rax are used as temporaries.
+ .set RNDKEYLAST_PTR, %rsi
+ .set H_CUR, %xmm2
+ .set H_POW1, %xmm3 // H^1
+ .set H_POW1_X64, %xmm4 // H^1 * x^64
+ .set GFPOLY, %xmm5
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax
+1:
+ aesenc (%rax), H_POW1
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), H_POW1
+
+ // Preprocess the raw hash subkey as needed to operate on GHASH's
+ // bit-reflected values directly: reflect its bytes, then multiply it by
+ // x^-1 (using the backwards interpretation of polynomial coefficients
+ // from the GCM spec) or equivalently x^1 (using the alternative,
+ // natural interpretation of polynomial coefficients).
+ pshufb .Lbswap_mask(%rip), H_POW1
+ movdqa H_POW1, %xmm0
+ pshufd $0xd3, %xmm0, %xmm0
+ psrad $31, %xmm0
+ paddq H_POW1, H_POW1
+ pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0
+ pxor %xmm0, H_POW1
+
+ // Store H^1.
+ movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
+
+ // Compute and store H^1 * x^64.
+ movq .Lgfpoly(%rip), GFPOLY
+ pshufd $0x4e, H_POW1, %xmm0
+ _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64
+ pxor %xmm0, H_POW1_X64
+ movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
+
+ // Compute and store the halves of H^1 XOR'd together.
+ pxor H_POW1, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
+
+ // Compute and store the remaining key powers H^2 through H^8.
+ movdqa H_POW1, H_CUR
+ mov $6*8, %eax
+.Lprecompute_next\@:
+ // Compute H^i = H^{i-1} * H^1.
+ _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
+ // Store H^i.
+ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
+ // Compute and store the halves of H^i XOR'd together.
+ pshufd $0x4e, H_CUR, %xmm0
+ pxor H_CUR, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
+ sub $8, %eax
+ jge .Lprecompute_next\@
+
+ RET
+.endm
+
+// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+// u8 ghash_acc[16], const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+.macro _aes_gcm_aad_update
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ // Note: _load_partial_block relies on AADLEN being in %ecx.
+
+ // Additional local variables.
+ // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
+ .set BSWAP_MASK, %xmm2
+ .set GHASH_ACC, %xmm3
+ .set H_POW1, %xmm4 // H^1
+ .set H_POW1_X64, %xmm5 // H^1 * x^64
+ .set GFPOLY, %xmm6
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Process the AAD one full block at a time.
+ sub $16, AADLEN
+ jl .Laad_loop_1x_done\@
+.Laad_loop_1x\@:
+ movdqu (AAD), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+ add $16, AAD
+ sub $16, AADLEN
+ jge .Laad_loop_1x\@
+.Laad_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, AADLEN
+ jz .Laad_done\@
+
+ // Process a partial block of length 1 <= AADLEN <= 15.
+ // _load_partial_block assumes that %ecx contains AADLEN.
+ _load_partial_block AAD, %xmm0, %r10, %r10d
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+
+.Laad_done\@:
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+ RET
+.endm
+
+// Increment LE_CTR eight times to generate eight little-endian counter blocks,
+// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
+// the zero-th AES round key. Clobbers TMP0 and TMP1.
+.macro _ctr_begin_8x
+ movq .Lone(%rip), TMP0
+ movdqa (KEY), TMP1 // zero-th round key
+.irp i, 0,1,2,3,4,5,6,7
+ _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
+ pxor TMP1, AESDATA\i
+ paddd TMP0, LE_CTR
+.endr
+.endm
+
+// Do a non-last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenc_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenc \round_key, AESDATA\i
+.endr
+.endm
+
+// Do the last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenclast_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenclast \round_key, AESDATA\i
+.endr
+.endm
+
+// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
+// store the result to DST. Clobbers TMP0.
+.macro _xor_data_8x
+.irp i, 0,1,2,3,4,5,6,7
+ _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
+.endr
+.irp i, 0,1,2,3,4,5,6,7
+ movdqu AESDATA\i, \i*16(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+ // Note: the code setting up for _load_partial_block assumes that SRC is
+ // in %rcx (and that DATALEN is *not* in %rcx).
+
+ // Additional local variables
+
+ // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
+ // with LE_CTR_PTR, which is used only at the beginning.
+
+ .set AESKEYLEN, %r10d // AES key length in bytes
+ .set AESKEYLEN64, %r10
+ .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
+
+ // Put the most frequently used values in %xmm0-%xmm7 to reduce code
+ // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
+ .set TMP0, %xmm0
+ .set TMP1, %xmm1
+ .set TMP2, %xmm2
+ .set LO, %xmm3 // Low part of unreduced product
+ .set MI, %xmm4 // Middle part of unreduced product
+ .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
+ // the high part of unreduced product
+ .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
+ .set LE_CTR, %xmm7 // Little-endian counter value
+ .set AESDATA0, %xmm8
+ .set AESDATA1, %xmm9
+ .set AESDATA2, %xmm10
+ .set AESDATA3, %xmm11
+ .set AESDATA4, %xmm12
+ .set AESDATA5, %xmm13
+ .set AESDATA6, %xmm14
+ .set AESDATA7, %xmm15
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqu (LE_CTR_PTR), LE_CTR
+
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+
+ // If there are at least 8*16 bytes of data, then continue into the main
+ // loop, which processes 8*16 bytes of data per iteration.
+ //
+ // The main loop interleaves AES and GHASH to improve performance on
+ // CPUs that can execute these instructions in parallel. When
+ // decrypting, the GHASH input (the ciphertext) is immediately
+ // available. When encrypting, we instead encrypt a set of 8 blocks
+ // first and then GHASH those blocks while encrypting the next set of 8,
+ // repeat that as needed, and finally GHASH the last set of 8 blocks.
+ //
+ // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
+ // as this makes the immediate fit in a signed byte, saving 3 bytes.
+ add $-8*16, DATALEN
+ jl .Lcrypt_loop_8x_done\@
+.if \enc
+ // Encrypt the first 8 plaintext blocks.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ _aesenc_8x TMP0
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ movdqa (%rsi), TMP0
+ _aesenclast_8x TMP0
+ _xor_data_8x
+ // Don't increment DST until the ciphertext blocks have been hashed.
+ sub $-8*16, SRC
+ add $-8*16, DATALEN
+ jl .Lghash_last_ciphertext_8x\@
+.endif
+
+ .p2align 4
+.Lcrypt_loop_8x\@:
+
+ // Generate the next set of 8 counter blocks and start encrypting them.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+
+ // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
+ // by doing the unreduced multiplication for the first ciphertext block.
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_begin_8x \enc
+
+ // Do 7 more rounds of AES, and continue the GHASH update by doing the
+ // unreduced multiplication for the remaining ciphertext blocks.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+
+ // Do the remaining AES rounds.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+
+ // Do the GHASH reduction and the last round of AES.
+ movdqa (RNDKEYLAST_PTR), TMP0
+ _ghash_update_end_8x_step 0
+ _aesenclast_8x TMP0
+ _ghash_update_end_8x_step 1
+
+ // XOR the data with the AES-CTR keystream blocks.
+.if \enc
+ sub $-8*16, DST
+.endif
+ _xor_data_8x
+ sub $-8*16, SRC
+.if !\enc
+ sub $-8*16, DST
+.endif
+ add $-8*16, DATALEN
+ jge .Lcrypt_loop_8x\@
+
+.if \enc
+.Lghash_last_ciphertext_8x\@:
+ // Update GHASH with the last set of 8 ciphertext blocks.
+ _ghash_update_begin_8x \enc
+ .p2align 4
+1:
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+ _ghash_update_end_8x_step 0
+ _ghash_update_end_8x_step 1
+ sub $-8*16, DST
+.endif
+
+.Lcrypt_loop_8x_done\@:
+
+ sub $-8*16, DATALEN
+ jz .Ldone\@
+
+ // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
+ // things simple and keep the code size down by just going one block at
+ // a time, again taking advantage of hardware loop unrolling. Since
+ // there are enough key powers available for all remaining data, we do
+ // the GHASH multiplications unreduced, and only reduce at the very end.
+
+ .set HI, TMP2
+ .set H_POW, AESDATA0
+ .set H_POW_XORED, AESDATA1
+ .set ONE, AESDATA2
+
+ movq .Lone(%rip), ONE
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ pxor LO, LO
+ pxor MI, MI
+ pxor HI, HI
+
+ // Set up a block counter %rax to contain 8*(8-n), where n is the number
+ // of blocks that remain, counting any partial block. This will be used
+ // to access the key powers H^n through H^1.
+ mov DATALEN, %eax
+ neg %eax
+ and $~15, %eax
+ sar $1, %eax
+ add $64, %eax
+
+ sub $16, DATALEN
+ jl .Lcrypt_loop_1x_done\@
+
+ // Process the data one full block at a time.
+.Lcrypt_loop_1x\@:
+
+ // Encrypt the next counter block.
+ _vpshufb BSWAP_MASK, LE_CTR, TMP0
+ paddd ONE, LE_CTR
+ pxor (KEY), TMP0
+ lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rsi), TMP0
+ aesenc -6*16(%rsi), TMP0
+192:
+ aesenc -5*16(%rsi), TMP0
+ aesenc -4*16(%rsi), TMP0
+128:
+.irp i, -3,-2,-1,0,1,2,3,4,5
+ aesenc \i*16(%rsi), TMP0
+.endr
+ aesenclast (RNDKEYLAST_PTR), TMP0
+
+ // Load the next key power H^i.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // XOR the keystream block that was just generated in TMP0 with the next
+ // source data block and store the resulting en/decrypted data to DST.
+.if \enc
+ _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
+ movdqu TMP0, (DST)
+.else
+ movdqu (SRC), TMP1
+ pxor TMP1, TMP0
+ movdqu TMP0, (DST)
+.endif
+
+ // Update GHASH with the ciphertext block.
+.if \enc
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+.else
+ pshufb BSWAP_MASK, TMP1
+ pxor TMP1, GHASH_ACC
+.endif
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+ pxor GHASH_ACC, GHASH_ACC
+
+ add $8, %eax
+ add $16, SRC
+ add $16, DST
+ sub $16, DATALEN
+ jge .Lcrypt_loop_1x\@
+.Lcrypt_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, DATALEN
+ jz .Lghash_reduce\@
+
+ // Process a partial block of length 1 <= DATALEN <= 15.
+
+ // Encrypt a counter block for the last time.
+ pshufb BSWAP_MASK, LE_CTR
+ pxor (KEY), LE_CTR
+ lea 16(KEY), %rsi
+1:
+ aesenc (%rsi), LE_CTR
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), LE_CTR
+
+ // Load the lowest key power, H^1.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
+ // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
+ // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
+ mov SRC, RNDKEYLAST_PTR
+ mov DATALEN, %ecx
+ _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi
+
+ // XOR the keystream block that was just generated in LE_CTR with the
+ // source data block and store the resulting en/decrypted data to DST.
+ pxor TMP0, LE_CTR
+ mov DATALEN, %ecx
+ _store_partial_block LE_CTR, DST
+
+ // If encrypting, zero-pad the final ciphertext block for GHASH. (If
+ // decrypting, this was already done by _load_partial_block.)
+.if \enc
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub DATALEN64, %rax
+ _vpand (%rax), LE_CTR, TMP0
+.endif
+
+ // Update GHASH with the final ciphertext block.
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+
+.Lghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+
+ RET
+.endm
+
+// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm2 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set BSWAP_MASK, %xmm3
+ .set GHASH_ACC, %xmm4
+ .set H_POW1, %xmm5 // H^1
+ .set H_POW1_X64, %xmm6 // H^1 * x^64
+ .set GFPOLY, %xmm7
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ movdqu (LE_CTR_PTR), %xmm0
+ mov $1, %eax
+ pinsrd $0, %eax, %xmm0
+
+ // Build the lengths block and XOR it into the GHASH accumulator.
+ movq TOTAL_DATALEN, GHASH_ACC
+ pinsrq $1, TOTAL_AADLEN, GHASH_ACC
+ psllq $3, GHASH_ACC // Bytes to bits
+ _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1
+
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Make %rax point to the 6th from last AES round key. (Using signed
+ // byte offsets -7*16 through 6*16 decreases code size.)
+ lea (KEY,AESKEYLEN64,4), %rax
+
+ // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ pshufb BSWAP_MASK, %xmm0
+ pxor (KEY), %xmm0
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rax), %xmm0
+ aesenc -6*16(%rax), %xmm0
+192:
+ aesenc -5*16(%rax), %xmm0
+ aesenc -4*16(%rax), %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ aesenc (\i-3)*16(%rax), %xmm0
+ _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+.endr
+ aesenclast 6*16(%rax), %xmm0
+ _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+
+ // Undo the byte reflection of the GHASH accumulator.
+ pshufb BSWAP_MASK, GHASH_ACC
+
+ // Encrypt the GHASH accumulator.
+ pxor %xmm0, GHASH_ACC
+
+.if \enc
+ // Return the computed auth tag.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
+
+ // Verify the auth tag in constant time by XOR'ing the transmitted and
+ // computed auth tags together and using the ptest instruction to check
+ // whether the first TAGLEN bytes of the result are zero.
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
+ movl 8(%rsp), TAGLEN
+ lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
+ sub TAGLEN64, ZEROPAD_MASK_PTR
+ xor %eax, %eax
+ _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
+ sete %al
+.endif
+ RET
+.endm
+
+.set USE_AVX, 0
+SYM_FUNC_START(aes_gcm_precompute_aesni)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni)
+SYM_FUNC_START(aes_gcm_aad_update_aesni)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_update_aesni)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni)
+SYM_FUNC_START(aes_gcm_dec_update_aesni)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_final_aesni)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni)
+SYM_FUNC_START(aes_gcm_dec_final_aesni)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni)
+
+.set USE_AVX, 1
+SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
+SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
new file mode 100644
index 000000000000..97e0ee515fc5
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -0,0 +1,1222 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
+// either AVX512 or AVX10. Some of the functions, notably the encryption and
+// decryption update functions which are the most performance-critical, are
+// provided in two variants generated from a macro: one using 256-bit vectors
+// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
+// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
+//
+// The functions that use 512-bit vectors are intended for CPUs that support
+// 512-bit vectors *and* where using them doesn't cause significant
+// downclocking. They require the following CPU features:
+//
+// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
+//
+// The other functions require the following CPU features:
+//
+// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
+//
+// All functions use the "System V" ABI. The Windows ABI is not supported.
+//
+// Note that we use "avx10" in the names of the functions as a shorthand to
+// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
+// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
+// to be a simple way to name things that makes sense on all CPUs.
+//
+// Note that the macros that support both 256-bit and 512-bit vectors could
+// fairly easily be changed to support 128-bit too. However, this would *not*
+// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
+// because the code heavily uses several features of these extensions other than
+// the vector length: the increase in the number of SIMD registers from 16 to
+// 32, masking support, and new instructions such as vpternlogd (which can do a
+// three-argument XOR). These features are very useful for AES-GCM.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 6
+
+ // A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+ // This is the GHASH reducing polynomial without its constant term, i.e.
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ // between bits and polynomial coefficients.
+ //
+ // Alternatively, it can be interpreted as the naturally-ordered
+ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ // "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .octa 0xc2000000000000000000000000000001
+
+ // Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+
+ // The below constants are used for incrementing the counter blocks.
+ // ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
+ // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
+ // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+ .octa 0
+ .octa 1
+.Linc_2blocks:
+ .octa 2
+ .octa 3
+.Linc_4blocks:
+ .octa 4
+
+// Number of powers of the hash key stored in the key struct. The powers are
+// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+#define NUM_H_POWERS 16
+
+// Offset to AES key length (in bytes) in the key struct
+#define OFFSETOF_AESKEYLEN 480
+
+// Offset to start of hash key powers array in the key struct
+#define OFFSETOF_H_POWERS 512
+
+// Offset to end of hash key powers array in the key struct.
+//
+// This is immediately followed by three zeroized padding blocks, which are
+// included so that partial vectors can be handled more easily. E.g. if VL=64
+// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
+// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+
+.text
+
+// Set the vector length in bytes. This sets the VL variable and defines
+// register aliases V0-V31 that map to the ymm or zmm registers.
+.macro _set_veclen vl
+ .set VL, \vl
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported vector length"
+.endif
+.endr
+.endm
+
+// The _ghash_mul_step macro does one step of GHASH multiplication of the
+// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
+// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
+// same size as \a and \b. To complete all steps, this must invoked with \i=0
+// through \i=9. The division into steps allows users of this macro to
+// optionally interleave the computation with other instructions. Users of this
+// macro must preserve the parameter registers across steps.
+//
+// The multiplications are done in GHASH's representation of the finite field
+// GF(2^128). Elements of GF(2^128) are represented as binary polynomials
+// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
+// just XOR, while multiplication is more complex and has two parts: (a) do
+// carryless multiplication of two 128-bit input polynomials to get a 256-bit
+// intermediate product polynomial, and (b) reduce the intermediate product to
+// 128 bits by adding multiples of G that cancel out terms in it. (Adding
+// multiples of G doesn't change which field element the polynomial represents.)
+//
+// Unfortunately, the GCM specification maps bits to/from polynomial
+// coefficients backwards from the natural order. In each byte it specifies the
+// highest bit to be the lowest order polynomial coefficient, *not* the highest!
+// This makes it nontrivial to work with the GHASH polynomials. We could
+// reflect the bits, but x86 doesn't have an instruction that does that.
+//
+// Instead, we operate on the values without bit-reflecting them. This *mostly*
+// just works, since XOR and carryless multiplication are symmetric with respect
+// to bit order, but it has some consequences. First, due to GHASH's byte
+// order, by skipping bit reflection, *byte* reflection becomes necessary to
+// give the polynomial terms a consistent order. E.g., considering an N-bit
+// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+// with. Fortunately, x86's vpshufb instruction can do byte reflection.
+//
+// Second, forgoing the bit reflection causes an extra multiple of x (still
+// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+// multiplication. This is because an M-bit by N-bit carryless multiplication
+// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+// to polynomial coefficients backwards, this zero-extension actually changes
+// the product by introducing an extra factor of x. Therefore, users of this
+// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+// the multiplicative inverse of x, to cancel out the extra x.
+//
+// Third, the backwards coefficients convention is just confusing to work with,
+// since it makes "low" and "high" in the polynomial math mean the opposite of
+// their normal meaning in computer programming. This can be solved by using an
+// alternative interpretation: the polynomial coefficients are understood to be
+// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
+// or the implementation at all; it just changes the mathematical interpretation
+// of what each instruction is doing. Starting from here, we'll use this
+// alternative interpretation, as it's easier to understand the code that way.
+//
+// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+// into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+//
+// LO = a_L * b_L
+// MI = (a_L * b_H) + (a_H * b_L)
+// HI = a_H * b_H
+//
+// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
+// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
+// HI right away, since the way the reduction works makes that unnecessary.
+//
+// For the reduction, we cancel out the low 128 bits by adding multiples of G =
+// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
+// which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
+// where A and B are 128-bit. Adding B_L*G to that value gives:
+//
+// x^64*A + B + B_L*G
+// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+//
+// So: if we sum A, B with its halves swapped, and the low half of B times x^63
+// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+// original value x^64*A + B. I.e., the low 64 bits got canceled out.
+//
+// We just need to apply this twice: first to fold LO into MI, and second to
+// fold the updated MI into HI.
+//
+// The needed three-argument XORs are done using the vpternlogd instruction with
+// immediate 0x96, since this is faster than two vpxord instructions.
+//
+// A potential optimization, assuming that b is fixed per-key (if a is fixed
+// per-key it would work the other way around), is to use one iteration of the
+// reduction described above to precompute a value c such that x^64*c = b mod G,
+// and then multiply a_L by c (and implicitly by x^64) instead of by b:
+//
+// MI = (a_L * c_L) + (a_H * b_L)
+// HI = (a_L * c_H) + (a_H * b_H)
+//
+// This would eliminate the LO part of the intermediate product, which would
+// eliminate the need to fold LO into MI. This would save two instructions,
+// including a vpclmulqdq. However, we currently don't use this optimization
+// because it would require twice as many per-key precomputed values.
+//
+// Using Karatsuba multiplication instead of "schoolbook" multiplication
+// similarly would save a vpclmulqdq but does not seem to be worth it.
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
+.elseif \i == 1
+ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
+.elseif \i == 2
+ vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1
+.elseif \i == 3
+ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+.elseif \i == 5
+ vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI
+.elseif \i == 6
+ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
+.elseif \i == 7
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+.elseif \i == 9
+ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3
+ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H
+ vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L
+ vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H
+ vpxord \t0, \lo, \lo
+ vpternlogd $0x96, \t2, \t1, \mi
+ vpxord \t3, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0
+ vpclmulqdq $0x01, \lo, \gfpoly, \t0
+ vpshufd $0x4e, \lo, \lo
+ vpternlogd $0x96, \t0, \lo, \mi
+ vpclmulqdq $0x01, \mi, \gfpoly, \t0
+ vpshufd $0x4e, \mi, \mi
+ vpternlogd $0x96, \t0, \mi, \hi
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
+//
+// Given the expanded AES key |key->aes_key|, this function derives the GHASH
+// subkey and initializes |key->ghash_key_powers| with powers of it.
+//
+// The number of key powers initialized is NUM_H_POWERS, and they are stored in
+// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
+// powers themselves are also initialized.
+//
+// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
+// with the desired length. In the VL=32 case, the function computes twice as
+// many key powers than are actually used by the VL=32 GCM update functions.
+// This is done to keep the key format the same regardless of vector length.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables. V0-V2 and %rax are used as temporaries.
+ .set POWERS_PTR, %rsi
+ .set RNDKEYLAST_PTR, %rdx
+ .set H_CUR, V3
+ .set H_CUR_YMM, %ymm3
+ .set H_CUR_XMM, %xmm3
+ .set H_INC, V4
+ .set H_INC_YMM, %ymm4
+ .set H_INC_XMM, %xmm4
+ .set GFPOLY, V5
+ .set GFPOLY_YMM, %ymm5
+ .set GFPOLY_XMM, %xmm5
+
+ // Get pointer to lowest set of key powers (located at end of array).
+ lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block
+ add $16, KEY
+1:
+ vaesenc (KEY), %xmm0, %xmm0
+ add $16, KEY
+ cmp KEY, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0
+
+ // Reflect the bytes of the raw hash subkey.
+ vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM
+
+ // Zeroize the padding blocks.
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu %ymm0, VL(POWERS_PTR)
+ vmovdqu %xmm0, VL+2*16(POWERS_PTR)
+
+ // Finish preprocessing the first key power, H^1. Since this GHASH
+ // implementation operates directly on values with the backwards bit
+ // order specified by the GCM standard, it's necessary to preprocess the
+ // raw key as follows. First, reflect its bytes. Second, multiply it
+ // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+ // interpretation of polynomial coefficients), which can also be
+ // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+ // + 1 using the alternative, natural interpretation of polynomial
+ // coefficients. For details, see the comment above _ghash_mul_step.
+ //
+ // Either way, for the multiplication the concrete operation performed
+ // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+ // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
+ // wide shift instruction, so instead double each of the two 64-bit
+ // halves and incorporate the internal carry bit into the value XOR'd.
+ vpshufd $0xd3, H_CUR_XMM, %xmm0
+ vpsrad $31, %xmm0, %xmm0
+ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+ vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
+ vpxor %xmm0, H_CUR_XMM, H_CUR_XMM
+
+ // Load the gfpoly constant.
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Square H^1 to get H^2.
+ //
+ // Note that as with H^1, all higher key powers also need an extra
+ // factor of x^-1 (or x using the natural interpretation). Nothing
+ // special needs to be done to make this happen, though: H^1 * H^1 would
+ // end up with two factors of x^-1, but the multiplication consumes one.
+ // So the product H^2 ends up with the desired one factor of x^-1.
+ _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
+ %xmm0, %xmm1, %xmm2
+
+ // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+ vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
+ vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM
+
+.if VL == 64
+ // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+ _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
+ %ymm0, %ymm1, %ymm2
+ vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR
+ vshufi64x2 $0, H_INC, H_INC, H_INC
+.endif
+
+ // Store the lowest set of key powers.
+ vmovdqu8 H_CUR, (POWERS_PTR)
+
+ // Compute and store the remaining key powers. With VL=32, repeatedly
+ // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
+ // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+ // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+ mov $(NUM_H_POWERS*16/VL) - 1, %eax
+.Lprecompute_next\@:
+ sub $VL, POWERS_PTR
+ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+ vmovdqu8 H_CUR, (POWERS_PTR)
+ dec %eax
+ jnz .Lprecompute_next\@
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+.endm
+
+// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
+.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
+ vextracti32x4 $1, \src, \t0_xmm
+.if VL == 32
+ vpxord \t0_xmm, \src_xmm, \dst_xmm
+.elseif VL == 64
+ vextracti32x4 $2, \src, \t1_xmm
+ vextracti32x4 $3, \src, \t2_xmm
+ vpxord \t0_xmm, \src_xmm, \dst_xmm
+ vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm
+.else
+ .error "Unsupported vector length"
+.endif
+.endm
+
+// Do one step of the GHASH update of the data blocks given in the vector
+// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
+// division into steps allows users of this macro to optionally interleave the
+// computation with other instructions. This macro uses the vector register
+// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
+// data blocks. The parameter registers must be preserved across steps.
+//
+// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+// operations are vectorized operations on vectors of 16-byte blocks. E.g.,
+// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
+// to the following non-vectorized terms:
+//
+// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
+// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
+// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
+// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+//
+// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+//
+// More concretely, this code does:
+// - Do vectorized "schoolbook" multiplications to compute the intermediate
+// 256-bit product of each block and its corresponding hash key power.
+// There are 4*VL/16 of these intermediate products.
+// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
+// VL/16 256-bit intermediate values.
+// - Do a vectorized reduction of these 256-bit intermediate values to
+// 128-bits each. This leaves VL/16 128-bit intermediate values.
+// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+//
+// See _ghash_mul_step for the full explanation of the operations performed for
+// each individual finite field multiplication and reduction.
+.macro _ghash_step_4x i
+.if \i == 0
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0
+ vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1
+ vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2
+.elseif \i == 1
+ vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3
+ vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0
+ vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1
+ vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2
+.elseif \i == 2
+ vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0})
+ vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0})
+ vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0
+.elseif \i == 3
+ vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1
+ vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0})
+ vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3
+.elseif \i == 4
+ vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0})
+ vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5
+ vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6
+.elseif \i == 5
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0})
+ vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57)
+ vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7
+ vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0})
+.elseif \i == 6
+ vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO
+ vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0
+ vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1
+ vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2
+.elseif \i == 7
+ vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI
+ vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3
+ vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0})
+ vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0})
+ vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI
+ vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI
+.elseif \i == 9
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+.endif
+.endm
+
+// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
+// the round key that has been broadcast to all 128-bit lanes of \round_key.
+.macro _vaesenc_4x round_key
+ vaesenc \round_key, V0, V0
+ vaesenc \round_key, V1, V1
+ vaesenc \round_key, V2, V2
+ vaesenc \round_key, V3, V3
+.endm
+
+// Start the AES encryption of four vectors of counter blocks.
+.macro _ctr_begin_4x
+
+ // Increment LE_CTR four times to generate four vectors of little-endian
+ // counter blocks, swap each to big-endian, and store them in V0-V3.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V1
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V2
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V3
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+
+ // AES "round zero": XOR in the zero-th round key.
+ vpxord RNDKEY0, V0, V0
+ vpxord RNDKEY0, V1, V1
+ vpxord RNDKEY0, V2, V2
+ vpxord RNDKEY0, V3, V3
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one). This macro supports both
+// VL=32 and VL=64. _set_veclen must have been invoked with the desired length.
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+
+ // Additional local variables
+
+ // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also
+ // available as a temporary register after the counter is loaded.
+
+ // AES key length in bytes
+ .set AESKEYLEN, %r10d
+ .set AESKEYLEN64, %r10
+
+ // Pointer to the last AES round key for the chosen AES variant
+ .set RNDKEYLAST_PTR, %r11
+
+ // In the main loop, V0-V3 are used as AES input and output. Elsewhere
+ // they are used as temporary registers.
+
+ // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+ .set GHASHDATA0, V4
+ .set GHASHDATA0_XMM, %xmm4
+ .set GHASHDATA1, V5
+ .set GHASHDATA1_XMM, %xmm5
+ .set GHASHDATA2, V6
+ .set GHASHDATA2_XMM, %xmm6
+ .set GHASHDATA3, V7
+
+ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ // using vpshufb, copied to all 128-bit lanes.
+ .set BSWAP_MASK, V8
+
+ // RNDKEY temporarily holds the next AES round key.
+ .set RNDKEY, V9
+
+ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ // only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ // more than one lane may be used, and they need to be XOR'd together.
+ .set GHASH_ACC, V10
+ .set GHASH_ACC_XMM, %xmm10
+
+ // LE_CTR_INC is the vector of 32-bit words that need to be added to a
+ // vector of little-endian counter blocks to advance it forwards.
+ .set LE_CTR_INC, V11
+
+ // LE_CTR contains the next set of little-endian counter blocks.
+ .set LE_CTR, V12
+
+ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+ // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
+ // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+ .set RNDKEY0, V13
+ .set RNDKEYLAST, V14
+ .set RNDKEY_M9, V15
+ .set RNDKEY_M8, V16
+ .set RNDKEY_M7, V17
+ .set RNDKEY_M6, V18
+ .set RNDKEY_M5, V19
+
+ // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
+ // the corresponding block of source data. This is useful because
+ // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
+ // be computed in parallel with the AES rounds.
+ .set RNDKEYLAST0, V20
+ .set RNDKEYLAST1, V21
+ .set RNDKEYLAST2, V22
+ .set RNDKEYLAST3, V23
+
+ // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
+ // cannot coincide with anything used for AES encryption, since for
+ // performance reasons GHASH and AES encryption are interleaved.
+ .set GHASHTMP0, V24
+ .set GHASHTMP1, V25
+ .set GHASHTMP2, V26
+
+ // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
+ // descending numbering reflects the order of the key powers.
+ .set H_POW4, V27
+ .set H_POW3, V28
+ .set H_POW2, V29
+ .set H_POW1, V30
+
+ // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+ .set GFPOLY, V31
+
+ // Load some constants.
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Load the GHASH accumulator and the starting counter.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+ vbroadcasti32x4 (LE_CTR_PTR), LE_CTR
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ // respectively. Then load the zero-th and last round keys.
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+ vbroadcasti32x4 (KEY), RNDKEY0
+ vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+ // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
+.if VL == 32
+ vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC
+.elseif VL == 64
+ vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC
+.else
+ .error "Unsupported vector length"
+.endif
+
+ // If there are at least 4*VL bytes of data, then continue into the loop
+ // that processes 4*VL bytes of data at a time. Otherwise skip it.
+ //
+ // Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+ // loop and also ensures that at least one write always occurs to
+ // DATALEN, zero-extending it and allowing DATALEN64 to be used later.
+ sub $4*VL, DATALEN
+ jl .Lcrypt_loop_4x_done\@
+
+ // Load powers of the hash key.
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+
+ // Main loop: en/decrypt and hash 4 vectors at a time.
+ //
+ // When possible, interleave the AES encryption of the counter blocks
+ // with the GHASH update of the ciphertext blocks. This improves
+ // performance on many CPUs because the execution ports used by the VAES
+ // instructions often differ from those used by vpclmulqdq and other
+ // instructions used in GHASH. For example, many Intel CPUs dispatch
+ // vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+ //
+ // The interleaving is easiest to do during decryption, since during
+ // decryption the ciphertext blocks are immediately available. For
+ // encryption, instead encrypt the first set of blocks, then hash those
+ // blocks while encrypting the next set of blocks, repeat that as
+ // needed, and finally hash the last set of blocks.
+
+.if \enc
+ // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
+ // ciphertext in GHASHDATA[0-3] for GHASH.
+ _ctr_begin_4x
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ _vaesenc_4x RNDKEY
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+ vaesenclast RNDKEYLAST0, V0, GHASHDATA0
+ vaesenclast RNDKEYLAST1, V1, GHASHDATA1
+ vaesenclast RNDKEYLAST2, V2, GHASHDATA2
+ vaesenclast RNDKEYLAST3, V3, GHASHDATA3
+ vmovdqu8 GHASHDATA0, 0*VL(DST)
+ vmovdqu8 GHASHDATA1, 1*VL(DST)
+ vmovdqu8 GHASHDATA2, 2*VL(DST)
+ vmovdqu8 GHASHDATA3, 3*VL(DST)
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, DATALEN
+ jl .Lghash_last_ciphertext_4x\@
+.endif
+
+ // Cache as many additional AES round keys as possible.
+.irp i, 9,8,7,6,5
+ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
+.endr
+
+.Lcrypt_loop_4x\@:
+
+ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
+ // encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+.if !\enc
+ vmovdqu8 0*VL(SRC), GHASHDATA0
+ vmovdqu8 1*VL(SRC), GHASHDATA1
+ vmovdqu8 2*VL(SRC), GHASHDATA2
+ vmovdqu8 3*VL(SRC), GHASHDATA3
+.endif
+
+ // Start the AES encryption of the counter blocks.
+ _ctr_begin_4x
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+192:
+ vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+128:
+
+ // XOR the source data with the last round key, saving the result in
+ // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+.if \enc
+ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+.else
+ vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
+ vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
+ vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
+ vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
+.endif
+
+ // Finish the AES encryption of the counter blocks in V0-V3, interleaved
+ // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+.irp i, 9,8,7,6,5
+ _vaesenc_4x RNDKEY_M\i
+ _ghash_step_4x (9 - \i)
+.endr
+.irp i, 4,3,2,1
+ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ _ghash_step_4x (9 - \i)
+.endr
+ _ghash_step_4x 9
+
+ // Do the last AES round. This handles the XOR with the source data
+ // too, as per the optimization described above.
+ vaesenclast RNDKEYLAST0, V0, GHASHDATA0
+ vaesenclast RNDKEYLAST1, V1, GHASHDATA1
+ vaesenclast RNDKEYLAST2, V2, GHASHDATA2
+ vaesenclast RNDKEYLAST3, V3, GHASHDATA3
+
+ // Store the en/decrypted data to DST.
+ vmovdqu8 GHASHDATA0, 0*VL(DST)
+ vmovdqu8 GHASHDATA1, 1*VL(DST)
+ vmovdqu8 GHASHDATA2, 2*VL(DST)
+ vmovdqu8 GHASHDATA3, 3*VL(DST)
+
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, DATALEN
+ jge .Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+ // Update GHASH with the last set of ciphertext blocks.
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i
+.endr
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+ // Undo the extra subtraction by 4*VL and check whether data remains.
+ add $4*VL, DATALEN
+ jz .Ldone\@
+
+ // The data length isn't a multiple of 4*VL. Process the remaining data
+ // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+ // Going one vector at a time may seem inefficient compared to having
+ // separate code paths for each possible number of vectors remaining.
+ // However, using a loop keeps the code size down, and it performs
+ // surprising well; modern CPUs will start executing the next iteration
+ // before the previous one finishes and also predict the number of loop
+ // iterations. For a similar reason, we roll up the AES rounds.
+ //
+ // On the last iteration, the remaining length may be less than VL.
+ // Handle this using masking.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set LO, GHASHDATA0
+ .set LO_XMM, GHASHDATA0_XMM
+ .set MI, GHASHDATA1
+ .set MI_XMM, GHASHDATA1_XMM
+ .set HI, GHASHDATA2
+ .set HI_XMM, GHASHDATA2_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+ // Select the appropriate mask for this iteration: all 1's if
+ // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
+.if VL < 64
+ mov $-1, %eax
+ bzhi DATALEN, %eax, %eax
+ kmovd %eax, %k1
+.else
+ mov $-1, %rax
+ bzhi DATALEN64, %rax, %rax
+ kmovq %rax, %k1
+.endif
+
+ // Encrypt a vector of counter blocks. This does not need to be masked.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpxord RNDKEY0, V0, V0
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ vaesenc RNDKEY, V0, V0
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast RNDKEYLAST, V0, V0
+
+ // XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 (SRC), V1{%k1}{z}
+ vpxord V1, V0, V0
+ vmovdqu8 V0, (DST){%k1}
+
+ // Update GHASH with the ciphertext block(s), without reducing.
+ //
+ // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+ // (If decrypting, it's done by the above masked load. If encrypting,
+ // it's done by the below masked register-to-register move.) Note that
+ // if DATALEN <= VL - 16, there will be additional padding beyond the
+ // padding of the last block specified by GHASH itself; i.e., there may
+ // be whole block(s) that get processed by the GHASH multiplication and
+ // reduction instructions but should not actually be included in the
+ // GHASH. However, any such blocks are all-zeroes, and the values that
+ // they're multiplied with are also all-zeroes. Therefore they just add
+ // 0 * 0 = 0 to the final GHASH result, which makes no difference.
+ vmovdqu8 (POWERS_PTR), H_POW1
+.if \enc
+ vmovdqu8 V0, V1{%k1}{z}
+.endif
+ vpshufb BSWAP_MASK, V1, V0
+ vpxord GHASH_ACC, V0, V0
+ _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+ add $VL, POWERS_PTR
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, DATALEN
+ jg .Lcrypt_loop_1x\@
+
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GFPOLY, V0
+ _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4],
+// const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+
+ // Additional local variables.
+ // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set GFPOLY, %xmm4
+ .set BSWAP_MASK, %xmm5
+ .set LE_CTR, %xmm6
+ .set GHASH_ACC, %xmm7
+ .set H_POW1, %xmm8
+
+ // Load some constants.
+ vmovdqa .Lgfpoly(%rip), GFPOLY
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+ // Build the lengths block and XOR it with the GHASH accumulator.
+ // Although the lengths block is defined as the AAD length followed by
+ // the en/decrypted data length, both in big-endian byte order, a byte
+ // reflection of the full block is needed because of the way we compute
+ // GHASH (see _ghash_mul_step). By using little-endian values in the
+ // opposite order, we avoid having to reflect any bytes here.
+ vmovq TOTAL_DATALEN, %xmm0
+ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0
+ vpsllq $3, %xmm0, %xmm0 // Bytes to bits
+ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+ // Load the first hash key power (H^1), which is stored last.
+ vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+.if !\enc
+ // Prepare a mask of TAGLEN one bits.
+ movl 8(%rsp), TAGLEN
+ mov $-1, %eax
+ bzhi TAGLEN, %eax, %eax
+ kmovd %eax, %k1
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+ // accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vmovdqu (TAG), %xmm1
+ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, %xmm0
+ xor %eax, %eax
+ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes
+ vptest %xmm0, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only used xmm registers were used.
+ RET
+.endm
+
+_set_veclen 32
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
+
+_set_veclen 64
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+
+// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
+// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
+// must be a multiple of 16, except on the last call where it can be any length.
+// The caller must do any buffering needed to ensure this.
+//
+// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
+// Therefore, for AAD processing we currently only provide this implementation
+// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
+// keeps the code size down, and it enables some micro-optimizations, e.g. using
+// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
+// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
+// provide a version using 512-bit vectors, but that doesn't seem to be useful.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
+ .set BSWAP_MASK, %ymm4
+ .set GFPOLY, %ymm5
+ .set GHASH_ACC, %ymm6
+ .set GHASH_ACC_XMM, %xmm6
+ .set H_POW1, %ymm7
+
+ // Load some constants.
+ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti128 .Lgfpoly(%rip), GFPOLY
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Update GHASH with 32 bytes of AAD at a time.
+ //
+ // Pre-subtracting 32 from AADLEN saves an instruction from the loop and
+ // also ensures that at least one write always occurs to AADLEN,
+ // zero-extending it and allowing AADLEN64 to be used later.
+ sub $32, AADLEN
+ jl .Laad_loop_1x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1]
+.Laad_loop_1x:
+ vmovdqu (AAD), %ymm0
+ vpshufb BSWAP_MASK, %ymm0, %ymm0
+ vpxor %ymm0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %ymm0, %ymm1, %ymm2
+ vextracti128 $1, GHASH_ACC, %xmm0
+ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+ add $32, AAD
+ sub $32, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+ add $32, AADLEN
+ jz .Laad_done
+
+ // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
+ mov $-1, %eax
+ bzhi AADLEN, %eax, %eax
+ kmovd %eax, %k1
+ vmovdqu8 (AAD), %ymm0{%k1}{z}
+ neg AADLEN64
+ and $~15, AADLEN64 // -round_up(AADLEN, 16)
+ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
+ vpshufb BSWAP_MASK, %ymm0, %ymm0
+ vpxor %ymm0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %ymm0, %ymm1, %ymm2
+ vextracti128 $1, GHASH_ACC, %xmm0
+ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 39066b57a70e..eb153eff9331 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -10,16 +10,7 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
- * interface for 64-bit kernels.
- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
- * Aidan O'Mahony (aidan.o.mahony@intel.com)
- * Adrian Hoban <adrian.hoban@intel.com>
- * James Guilford (james.guilford@intel.com)
- * Gabriele Paoloni <gabriele.paoloni@intel.com>
- * Tadeusz Struk (tadeusz.struk@intel.com)
- * Wajdi Feghali (wajdi.k.feghali@intel.com)
- * Copyright (c) 2010, Intel Corporation.
+ * Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
@@ -27,95 +18,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
-
-/*
- * The following macros are used to move an (un)aligned 16 byte value to/from
- * an XMM register. This can done for either FP or integer values, for FP use
- * movaps (move aligned packed single) or integer use movdqa (move double quad
- * aligned). It doesn't make a performance difference which instruction is used
- * since Nehalem (original Core i7) was released. However, the movaps is a byte
- * shorter, so that is the one we'll use for now. (same for unaligned).
- */
-#define MOVADQ movaps
-#define MOVUDQ movups
-
-#ifdef __x86_64__
-
-# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY: .octa 0xC2000000000000000000000000000001
-.section .rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
-.section .rodata.cst16.MASK1, "aM", @progbits, 16
-.align 16
-MASK1: .octa 0x0000000000000000ffffffffffffffff
-.section .rodata.cst16.MASK2, "aM", @progbits, 16
-.align 16
-MASK2: .octa 0xffffffffffffffff0000000000000000
-.section .rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE: .octa 0x00000000000000000000000000000001
-.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
-.align 16
-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
-.section .rodata.cst16.dec, "aM", @progbits, 16
-.align 16
-dec: .octa 0x1
-.section .rodata.cst16.enc, "aM", @progbits, 16
-.align 16
-enc: .octa 0x2
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK,
-# and zero should follow ALL_F
-.section .rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- .octa 0x00000000000000000000000000000000
-
-.text
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-#define HashKey 16*6 // store HashKey <<1 mod poly here
-#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
-#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
-#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
-#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
- // bits of HashKey <<1 mod poly here
- //(for Karatsuba purposes)
-#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
- // bits of HashKey^2 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
- // bits of HashKey^3 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
- // bits of HashKey^4 <<1 mod poly here
- // (for Karatsuba purposes)
-
-#define arg1 rdi
-#define arg2 rsi
-#define arg3 rdx
-#define arg4 rcx
-#define arg5 r8
-#define arg6 r9
-#define keysize 2*15*16(%arg1)
-#endif
-
#define STATE1 %xmm0
#define STATE2 %xmm4
@@ -162,1409 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define TKEYP T1
#endif
-.macro FUNC_SAVE
- push %r12
- push %r13
- push %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-.endm
-
-
-.macro FUNC_RESTORE
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-# Precompute hashkeys.
-# Input: Hash subkey.
-# Output: HashKeys stored in gcm_context_data. Only needs to be called
-# once per key.
-# clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
- mov \SUBKEY, %r12
- movdqu (%r12), \TMP3
- movdqa SHUF_MASK(%rip), \TMP2
- pshufb \TMP2, \TMP3
-
- # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
- movdqa \TMP3, \TMP2
- psllq $1, \TMP3
- psrlq $63, \TMP2
- movdqa \TMP2, \TMP1
- pslldq $8, \TMP2
- psrldq $8, \TMP1
- por \TMP2, \TMP3
-
- # reduce HashKey<<1
-
- pshufd $0x24, \TMP1, \TMP2
- pcmpeqd TWOONE(%rip), \TMP2
- pand POLY(%rip), \TMP2
- pxor \TMP2, \TMP3
- movdqu \TMP3, HashKey(%arg2)
-
- movdqa \TMP3, \TMP5
- pshufd $78, \TMP3, \TMP1
- pxor \TMP3, \TMP1
- movdqu \TMP1, HashKey_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
- movdqu \TMP5, HashKey_2(%arg2)
-# HashKey_2 = HashKey^2<<1 (mod poly)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_2_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
- movdqu \TMP5, HashKey_3(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_3_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
- movdqu \TMP5, HashKey_4(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_4_k(%arg2)
-.endm
-
-# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
-# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
-.macro GCM_INIT Iv SUBKEY AAD AADLEN
- mov \AADLEN, %r11
- mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11d, %r11d
- mov %r11, InLen(%arg2) # ctx_data.in_length = 0
- mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
- mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
- mov \Iv, %rax
- movdqu (%rax), %xmm0
- movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
-
- movdqa SHUF_MASK(%rip), %xmm2
- pshufb %xmm2, %xmm0
- movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
-
- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
- movdqu HashKey(%arg2), %xmm13
-
- CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
- %xmm4, %xmm5, %xmm6
-.endm
-
-# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
-# struct has been initialized by GCM_INIT.
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
-# Clobbers rax, r10-r13, and xmm0-xmm15
-.macro GCM_ENC_DEC operation
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
- add %arg5, InLen(%arg2)
-
- xor %r11d, %r11d # initialise the data pointer offset as zero
- PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
-
- sub %r11, %arg5 # sub partial block data used
- mov %arg5, %r13 # save the number of bytes
-
- and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
- mov %r13, %r12
- # Encrypt/Decrypt first few blocks
-
- and $(3<<4), %r12
- jz .L_initial_num_blocks_is_0_\@
- cmp $(2<<4), %r12
- jb .L_initial_num_blocks_is_1_\@
- je .L_initial_num_blocks_is_2_\@
-.L_initial_num_blocks_is_3_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
- sub $48, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_2_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
- sub $32, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_1_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
- sub $16, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_0_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-.L_initial_blocks_\@:
-
- # Main loop - Encrypt/Decrypt remaining blocks
-
- test %r13, %r13
- je .L_zero_cipher_left_\@
- sub $64, %r13
- je .L_four_cipher_left_\@
-.L_crypt_by_4_\@:
- GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
- %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
- %xmm7, %xmm8, enc
- add $64, %r11
- sub $64, %r13
- jne .L_crypt_by_4_\@
-.L_four_cipher_left_\@:
- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-.L_zero_cipher_left_\@:
- movdqu %xmm8, AadHash(%arg2)
- movdqu %xmm0, CurCount(%arg2)
-
- mov %arg5, %r13
- and $15, %r13 # %r13 = arg5 (mod 16)
- je .L_multiple_of_16_bytes_\@
-
- mov %r13, PBlockLen(%arg2)
-
- # Handle the last <16 Byte block separately
- paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
- movdqu %xmm0, CurCount(%arg2)
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm0
-
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- movdqu %xmm0, PBlockEncKey(%arg2)
-
- cmp $16, %arg5
- jge .L_large_enough_update_\@
-
- lea (%arg4,%r11,1), %r10
- mov %r13, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
- jmp .L_data_read_\@
-
-.L_large_enough_update_\@:
- sub $16, %r11
- add %r13, %r11
-
- # receive the last <16 Byte block
- movdqu (%arg4, %r11, 1), %xmm1
-
- sub %r13, %r11
- add $16, %r11
-
- lea SHIFT_MASK+16(%rip), %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- sub %r13, %r12
- # get the appropriate shuffle mask
- movdqu (%r12), %xmm2
- # shift right 16-r13 bytes
- pshufb %xmm2, %xmm1
-
-.L_data_read_\@:
- lea ALL_F+16(%rip), %r12
- sub %r13, %r12
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm2
-.endif
- pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
- movdqu (%r12), %xmm1
- # get the appropriate mask to mask out top 16-r13 bytes of xmm0
- pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
-.ifc \operation, dec
- pand %xmm1, %xmm2
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10 ,%xmm2
-
- pxor %xmm2, %xmm8
-.else
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10,%xmm0
-
- pxor %xmm0, %xmm8
-.endif
-
- movdqu %xmm8, AadHash(%arg2)
-.ifc \operation, enc
- # GHASH computation for the last <16 byte block
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm0 back to output as ciphertext
- pshufb %xmm10, %xmm0
-.endif
-
- # Output %r13 bytes
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
- mov %rax, (%arg3 , %r11, 1)
- add $8, %r11
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- mov %al, (%arg3, %r11, 1)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_multiple_of_16_bytes_\@:
-.endm
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
-
- mov PBlockLen(%arg2), %r12
-
- test %r12, %r12
- je .L_partial_done\@
-
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
- mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- movd %r12d, %xmm15 # len(A) in %xmm15
- mov InLen(%arg2), %r12
- shl $3, %r12 # len(C) in bits (*128)
- movq %r12, %xmm1
-
- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
- pxor %xmm15, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # final GHASH computation
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm8
-
- movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
- pxor %xmm8, %xmm0
-.L_return_T_\@:
- mov \AUTHTAG, %r10 # %r10 = authTag
- mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
- cmp $16, %r11
- je .L_T_16_\@
- cmp $8, %r11
- jl .L_T_4_\@
-.L_T_8_\@:
- movq %xmm0, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- psrldq $8, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_4_\@:
- movd %xmm0, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- psrldq $4, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_123_\@:
- movd %xmm0, %eax
- cmp $2, %r11
- jl .L_T_1_\@
- mov %ax, (%r10)
- cmp $2, %r11
- je .L_return_T_done_\@
- add $2, %r10
- sar $16, %eax
-.L_T_1_\@:
- mov %al, (%r10)
- jmp .L_return_T_done_\@
-.L_T_16_\@:
- movdqu %xmm0, (%r10)
-.L_return_T_done_\@:
-.endm
-
-#ifdef __x86_64__
-/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-*
-*
-* Input: A and B (128-bits each, bit-reflected)
-* Output: C = A*B*x mod poly, (i.e. >>1 )
-* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-*
-*/
-.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
- movdqa \GH, \TMP1
- pshufd $78, \GH, \TMP2
- pshufd $78, \HK, \TMP3
- pxor \GH, \TMP2 # TMP2 = a1+a0
- pxor \HK, \TMP3 # TMP3 = b1+b0
- pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \HK, \GH # GH = a0*b0
- pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
- pxor \GH, \TMP2
- pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \GH
- pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
-
- # first phase of the reduction
-
- movdqa \GH, \TMP2
- movdqa \GH, \TMP3
- movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
- # in in order to perform
- # independent shifts
- pslld $31, \TMP2 # packed right shift <<31
- pslld $30, \TMP3 # packed right shift <<30
- pslld $25, \TMP4 # packed right shift <<25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \GH
-
- # second phase of the reduction
-
- movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
- # in in order to perform
- # independent shifts
- movdqa \GH,\TMP3
- movdqa \GH,\TMP4
- psrld $1,\TMP2 # packed left shift >>1
- psrld $2,\TMP3 # packed left shift >>2
- psrld $7,\TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \GH
- pxor \TMP1, \GH # result is in TMP1
-.endm
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN and XMM1
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
- cmp $8, \DLEN
- jl .L_read_lt8_\@
- mov (\DPTR), %rax
- movq %rax, \XMMDst
- sub $8, \DLEN
- jz .L_done_read_partial_block_\@
- xor %eax, %eax
-.L_read_next_byte_\@:
- shl $8, %rax
- mov 7(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_\@
- movq %rax, \XMM1
- pslldq $8, \XMM1
- por \XMM1, \XMMDst
- jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
- xor %eax, %eax
-.L_read_next_byte_lt8_\@:
- shl $8, %rax
- mov -1(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_lt8_\@
- movq %rax, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
-# clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
- TMP6 TMP7
- MOVADQ SHUF_MASK(%rip), %xmm14
- mov \AAD, %r10 # %r10 = AAD
- mov \AADLEN, %r11 # %r11 = aadLen
- pxor \TMP7, \TMP7
- pxor \TMP6, \TMP6
-
- cmp $16, %r11
- jl .L_get_AAD_rest\@
-.L_get_AAD_blocks\@:
- movdqu (%r10), \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP7, \TMP6
- GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- add $16, %r10
- sub $16, %r11
- cmp $16, %r11
- jge .L_get_AAD_blocks\@
-
- movdqu \TMP6, \TMP7
-
- /* read the last <16B of AAD */
-.L_get_AAD_rest\@:
- test %r11, %r11
- je .L_get_AAD_done\@
-
- READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP6, \TMP7
- GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- movdqu \TMP7, \TMP6
-
-.L_get_AAD_done\@:
- movdqu \TMP6, AadHash(%arg2)
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
- AAD_HASH operation
- mov PBlockLen(%arg2), %r13
- test %r13, %r13
- je .L_partial_block_done_\@ # Leave Macro if no partial blocks
- # Read in input data without over reading
- cmp $16, \PLAIN_CYPH_LEN
- jl .L_fewer_than_16_bytes_\@
- movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp .L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
- mov \PLAIN_CYPH_LEN, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
-
- mov PBlockLen(%arg2), %r13
-
-.L_data_read_\@: # Finished reading in data
-
- movdqu PBlockEncKey(%arg2), %xmm9
- movdqu HashKey(%arg2), %xmm13
-
- lea SHIFT_MASK(%rip), %r12
-
- # adjust the shuffle mask pointer to be able to shift r13 bytes
- # r16-r13 is the number of bytes in plaintext mod 16)
- add %r13, %r12
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- pshufb %xmm2, %xmm9 # shift right r13 bytes
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_1_\@
- sub %r10, %r12
-.L_no_extra_mask_1_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
-
- pand %xmm1, %xmm3
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm3
- pshufb %xmm2, %xmm3
- pxor %xmm3, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_1_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_dec_done_\@
-.L_partial_incomplete_1_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_dec_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-.else
- pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_2_\@
- sub %r10, %r12
-.L_no_extra_mask_2_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9
-
- movdqa SHUF_MASK(%rip), %xmm1
- pshufb %xmm1, %xmm9
- pshufb %xmm2, %xmm9
- pxor %xmm9, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_2_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_encode_done_\@
-.L_partial_incomplete_2_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_encode_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm9 back to output as ciphertext
- pshufb %xmm10, %xmm9
- pshufb %xmm2, %xmm9
-.endif
- # output encrypted Bytes
- test %r10, %r10
- jl .L_partial_fill_\@
- mov %r13, %r12
- mov $16, %r13
- # Set r13 to be the number of bytes to write out
- sub %r12, %r13
- jmp .L_count_set_\@
-.L_partial_fill_\@:
- mov \PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
- movdqa %xmm9, %xmm0
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
-
- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $8, \DATA_OFFSET
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $1, \DATA_OFFSET
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg2, %arg3 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
- MOVADQ SHUF_MASK(%rip), %xmm14
-
- movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
-
- # start AES for num_initial_blocks blocks
-
- movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-
- MOVADQ ONE(%RIP),\TMP1
- MOVADQ 0(%arg1),\TMP2
-.irpc index, \i_seq
- paddd \TMP1, \XMM0 # INCR Y0
-.ifc \operation, dec
- movdqa \XMM0, %xmm\index
-.else
- MOVADQ \XMM0, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index # perform a 16 byte swap
- pxor \TMP2, %xmm\index
-.endr
- lea 0x10(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
-
-.Laes_loop_initial_\@:
- MOVADQ (%r10),\TMP1
-.irpc index, \i_seq
- aesenc \TMP1, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_initial_\@
-
- MOVADQ (%r10), \TMP1
-.irpc index, \i_seq
- aesenclast \TMP1, %xmm\index # Last Round
-.endr
-.irpc index, \i_seq
- movdqu (%arg4 , %r11, 1), \TMP1
- pxor \TMP1, %xmm\index
- movdqu %xmm\index, (%arg3 , %r11, 1)
- # write back plaintext/ciphertext for num_initial_blocks
- add $16, %r11
-
-.ifc \operation, dec
- movdqa \TMP1, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
- # apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
- pxor %xmm5, %xmm6
- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
- cmp $64, %r13
- jl .L_initial_blocks_done\@
- # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
- MOVADQ ONE(%RIP),\TMP1
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM1
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM2
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM3
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM4
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
- MOVADQ 0(%arg1),\TMP1
- pxor \TMP1, \XMM1
- pxor \TMP1, \XMM2
- pxor \TMP1, \XMM3
- pxor \TMP1, \XMM4
-.irpc index, 1234 # do 4 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
-.irpc index, 56789 # do next 5 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_pre_done\@
-
-.Laes_loop_pre_\@:
- MOVADQ (%r10),\TMP2
-.irpc index, 1234
- aesenc \TMP2, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_pre_\@
-
-.Laes_loop_pre_done\@:
- MOVADQ (%r10), \TMP2
- aesenclast \TMP2, \XMM1
- aesenclast \TMP2, \XMM2
- aesenclast \TMP2, \XMM3
- aesenclast \TMP2, \XMM4
- movdqu 16*0(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM1
-.ifc \operation, dec
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM1
-.endif
- movdqu 16*1(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM2
-.ifc \operation, dec
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM2
-.endif
- movdqu 16*2(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM3
-.ifc \operation, dec
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM3
-.endif
- movdqu 16*3(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM4
-.ifc \operation, dec
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM4
-.else
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
-.endif
-
- add $64, %r11
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
- pxor \XMMDst, \XMM1
-# combine GHASHed value with the corresponding ciphertext
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
-.L_initial_blocks_done\@:
-
-.endm
-
-/*
-* encrypt 4 blocks at a time
-* ghash the 4 previously encrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_enc_done\@
-
-.Laes_loop_par_enc\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_enc\@
-
-.Laes_loop_par_enc_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # Round 10
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed right shift << 31
- pslld $30, \TMP3 # packed right shift << 30
- pslld $25, \TMP4 # packed right shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed left shift >>1
- psrld $2, \TMP3 # packed left shift >>2
- psrld $7, \TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in TMP1
-
- pxor \XMM5, \XMM1
-.endm
-
-/*
-* decrypt 4 blocks at a time
-* ghash the 4 previously decrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_dec_done\@
-
-.Laes_loop_par_dec\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_dec\@
-
-.Laes_loop_par_dec_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # last round
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM1
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM2
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM3
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed right shift << 31
- pslld $30, \TMP3 # packed right shift << 30
- pslld $25, \TMP4 # packed right shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed left shift >>1
- psrld $2, \TMP3 # packed left shift >>2
- psrld $7, \TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in TMP1
-
- pxor \XMM5, \XMM1
-.endm
-
-/* GHASH the last 4 ciphertext blocks. */
-.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
-TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
-
- # Multiply TMP6 * HashKey (using Karatsuba)
-
- movdqa \XMM1, \TMP6
- pshufd $78, \XMM1, \TMP2
- pxor \XMM1, \TMP2
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
- movdqu HashKey_4_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqa \XMM1, \XMMDst
- movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM2, \TMP1
- pshufd $78, \XMM2, \TMP2
- pxor \XMM2, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
- movdqu HashKey_3_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM2, \XMMDst
- pxor \TMP2, \XMM1
-# results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM3, \TMP1
- pshufd $78, \XMM3, \TMP2
- pxor \XMM3, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
- movdqu HashKey_2_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM3, \XMMDst
- pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
- movdqa \XMM4, \TMP1
- pshufd $78, \XMM4, \TMP2
- pxor \XMM4, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
- movdqu HashKey_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM4, \XMMDst
- pxor \XMM1, \TMP2
- pxor \TMP6, \TMP2
- pxor \XMMDst, \TMP2
- # middle section of the temp results combined as in karatsuba algorithm
- movdqa \TMP2, \TMP4
- pslldq $8, \TMP4 # left shift TMP4 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP4, \XMMDst
- pxor \TMP2, \TMP6
-# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
- # first phase of the reduction
- movdqa \XMMDst, \TMP2
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
-# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
- pslld $31, \TMP2 # packed right shifting << 31
- pslld $30, \TMP3 # packed right shifting << 30
- pslld $25, \TMP4 # packed right shifting << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP7
- psrldq $4, \TMP7 # right shift TMP7 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMMDst
-
- # second phase of the reduction
- movdqa \XMMDst, \TMP2
- # make 3 copies of XMMDst for doing 3 shift operations
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
- psrld $1, \TMP2 # packed left shift >> 1
- psrld $2, \TMP3 # packed left shift >> 2
- psrld $7, \TMP4 # packed left shift >> 7
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- pxor \TMP7, \TMP2
- pxor \TMP2, \XMMDst
- pxor \TMP6, \XMMDst # reduced result is in XMMDst
-.endm
-
-
-/* Encryption of a single block
-* uses eax & r10
-*/
-
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
-
- pxor (%arg1), \XMM0
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- lea 16(%arg1), %r10 # get first expanded key address
-
-_esb_loop_\@:
- MOVADQ (%r10),\TMP1
- aesenc \TMP1,\XMM0
- add $16,%r10
- sub $1,%eax
- jnz _esb_loop_\@
-
- MOVADQ (%r10),\TMP1
- aesenclast \TMP1,\XMM0
-.endm
-
-/*****************************************************************************
-* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len) // Length of AAD in bytes.
-*/
-SYM_FUNC_START(aesni_gcm_init)
- FUNC_SAVE
- GCM_INIT %arg3, %arg4,%arg5, %arg6
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init)
-
-/*****************************************************************************
-* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_enc_update)
- FUNC_SAVE
- GCM_ENC_DEC enc
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update)
-
-/*****************************************************************************
-* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_dec_update)
- FUNC_SAVE
- GCM_ENC_DEC dec
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update)
-
-/*****************************************************************************
-* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*/
-SYM_FUNC_START(aesni_gcm_finalize)
- FUNC_SAVE
- GCM_COMPLETE %arg3 %arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize)
-
-#endif
-
SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
deleted file mode 100644
index 8c9749ed0651..000000000000
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ /dev/null
@@ -1,2804 +0,0 @@
-########################################################################
-# Copyright (c) 2013, Intel Corporation
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the
-# distribution.
-#
-# * Neither the name of the Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
-# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-##
-## Authors:
-## Erdinc Ozturk <erdinc.ozturk@intel.com>
-## Vinodh Gopal <vinodh.gopal@intel.com>
-## James Guilford <james.guilford@intel.com>
-## Tim Chen <tim.c.chen@linux.intel.com>
-##
-## References:
-## This code was derived and highly optimized from the code described in paper:
-## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
-## on Intel Architecture Processors. August, 2010
-## The details of the implementation is explained in:
-## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
-## on Intel Architecture Processors. October, 2012.
-##
-## Assumptions:
-##
-##
-##
-## iv:
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | Salt (From the SA) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | Initialization Vector |
-## | (This is the sequence number from IPSec header) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x1 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-##
-##
-## AAD:
-## AAD padded to 128 bits with 0
-## for example, assume AAD is a u32 vector
-##
-## if AAD is 8 bytes:
-## AAD[3] = {A0, A1}#
-## padded AAD in xmm register = {A1 A0 0 0}
-##
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | SPI (A1) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 32-bit Sequence Number (A0) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x0 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-## AAD Format with 32-bit Sequence Number
-##
-## if AAD is 12 bytes:
-## AAD[3] = {A0, A1, A2}#
-## padded AAD in xmm register = {A2 A1 A0 0}
-##
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | SPI (A2) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 64-bit Extended Sequence Number {A1,A0} |
-## | |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x0 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-## AAD Format with 64-bit Extended Sequence Number
-##
-##
-## aadLen:
-## from the definition of the spec, aadLen can only be 8 or 12 bytes.
-## The code additionally supports aadLen of length 16 bytes.
-##
-## TLen:
-## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-##
-## poly = x^128 + x^127 + x^126 + x^121 + 1
-## throughout the code, one tab and two tab indentations are used. one tab is
-## for GHASH part, two tabs is for AES part.
-##
-
-#include <linux/linkage.h>
-
-# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY: .octa 0xC2000000000000000000000000000001
-
-.section .rodata.cst16.POLY2, "aM", @progbits, 16
-.align 16
-POLY2: .octa 0xC20000000000000000000001C2000000
-
-.section .rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
-
-.section .rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE: .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst16.ONEf, "aM", @progbits, 16
-.align 16
-ONEf: .octa 0x01000000000000000000000000000000
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
-.section .rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- .octa 0x00000000000000000000000000000000
-
-.text
-
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-
-HashKey = 16*6 # store HashKey <<1 mod poly here
-HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
-HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
-HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
-HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
-HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
-HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
-HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
-HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-
-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
-#define arg4 %rcx
-#define arg5 %r8
-#define arg6 %r9
-#define keysize 2*15*16(arg1)
-
-i = 0
-j = 0
-
-out_order = 0
-in_order = 1
-DEC = 0
-ENC = 1
-
-.macro define_reg r n
-reg_\r = %xmm\n
-.endm
-
-.macro setreg
-.altmacro
-define_reg i %i
-define_reg j %j
-.noaltmacro
-.endm
-
-TMP1 = 16*0 # Temporary storage for AAD
-TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
-TMP3 = 16*2 # Temporary storage for AES State 3
-TMP4 = 16*3 # Temporary storage for AES State 4
-TMP5 = 16*4 # Temporary storage for AES State 5
-TMP6 = 16*5 # Temporary storage for AES State 6
-TMP7 = 16*6 # Temporary storage for AES State 7
-TMP8 = 16*7 # Temporary storage for AES State 8
-
-VARIABLE_OFFSET = 16*8
-
-################################
-# Utility Macros
-################################
-
-.macro FUNC_SAVE
- push %r12
- push %r13
- push %r15
-
- push %rbp
- mov %rsp, %rbp
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-.endm
-
-.macro FUNC_RESTORE
- mov %rbp, %rsp
- pop %rbp
-
- pop %r15
- pop %r13
- pop %r12
-.endm
-
-# Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK REP XMM0
- vpxor (arg1), \XMM0, \XMM0
- i = 1
- setreg
-.rep \REP
- vaesenc 16*i(arg1), \XMM0, \XMM0
- i = (i+1)
- setreg
-.endr
- vaesenclast 16*i(arg1), \XMM0, \XMM0
-.endm
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r15, rax
-.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
- vmovdqu AadHash(arg2), %xmm8
- vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
- add arg5, InLen(arg2)
-
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
-
- PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
- sub %r11, arg5
-
- mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz .L_initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je .L_initial_num_blocks_is_7\@
- cmp $6, %r12
- je .L_initial_num_blocks_is_6\@
- cmp $5, %r12
- je .L_initial_num_blocks_is_5\@
- cmp $4, %r12
- je .L_initial_num_blocks_is_4\@
- cmp $3, %r12
- je .L_initial_num_blocks_is_3\@
- cmp $2, %r12
- je .L_initial_num_blocks_is_2\@
-
- jmp .L_initial_num_blocks_is_1\@
-
-.L_initial_num_blocks_is_7\@:
- \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_6\@:
- \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_5\@:
- \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_4\@:
- \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_3\@:
- \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_2\@:
- \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_1\@:
- \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_0\@:
- \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-.L_initial_blocks_encrypted\@:
- test %r13, %r13
- je .L_zero_cipher_left\@
-
- sub $128, %r13
- je .L_eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-.L_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg .L_encrypt_by_8\@
-
-
-
- add $8, %r15b
- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne .L_encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp .L_eight_cipher_left\@
-
-.L_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne .L_encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-.L_eight_cipher_left\@:
- \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-.L_zero_cipher_left\@:
- vmovdqu %xmm14, AadHash(arg2)
- vmovdqu %xmm9, CurCount(arg2)
-
- # check for 0 length
- mov arg5, %r13
- and $15, %r13 # r13 = (arg5 mod 16)
-
- je .L_multiple_of_16_bytes\@
-
- # handle the last <16 Byte block separately
-
- mov %r13, PBlockLen(arg2)
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vmovdqu %xmm9, CurCount(arg2)
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
- vmovdqu %xmm9, PBlockEncKey(arg2)
-
- cmp $16, arg5
- jge .L_large_enough_update\@
-
- lea (arg4,%r11,1), %r10
- mov %r13, %r12
-
- READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
- jmp .L_final_ghash_mul\@
-
-.L_large_enough_update\@:
- sub $16, %r11
- add %r13, %r11
-
- # receive the last <16 Byte block
- vmovdqu (arg4, %r11, 1), %xmm1
-
- sub %r13, %r11
- add $16, %r11
-
- lea SHIFT_MASK+16(%rip), %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- sub %r13, %r12
- # get the appropriate shuffle mask
- vmovdqu (%r12), %xmm2
- # shift right 16-r13 bytes
- vpshufb %xmm2, %xmm1, %xmm1
-
-.L_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
-
- vmovdqu %xmm14, AadHash(arg2)
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
-
- vmovdqu %xmm14, AadHash(arg2)
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left\@
-
- mov %rax, (arg3 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-.L_less_than_8_bytes_left\@:
- movb %al, (arg3 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left\@
- #############################
-
-.L_multiple_of_16_bytes\@:
-.endm
-
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
- vmovdqu AadHash(arg2), %xmm14
- vmovdqu HashKey(arg2), %xmm13
-
- mov PBlockLen(arg2), %r12
- test %r12, %r12
- je .L_partial_done\@
-
- #GHASH computation for the last <16 Byte block
- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
- mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- mov InLen(arg2), %r12
- shl $3, %r12 # len(C) in bits (*128)
- vmovq %r12, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- vmovdqu OrigIV(arg2), %xmm9
-
- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-.L_return_T\@:
- mov \AUTH_TAG, %r10 # r10 = authTag
- mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je .L_T_16\@
-
- cmp $8, %r11
- jl .L_T_4\@
-
-.L_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- test %r11, %r11
- je .L_return_T_done\@
-.L_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- test %r11, %r11
- je .L_return_T_done\@
-.L_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl .L_T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je .L_return_T_done\@
- add $2, %r10
- sar $16, %eax
-.L_T_1\@:
- mov %al, (%r10)
- jmp .L_return_T_done\@
-
-.L_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-.L_return_T_done\@:
-.endm
-
-.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
-
- mov \AAD, %r10 # r10 = AAD
- mov \AADLEN, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor \T8, \T8, \T8
- vpxor \T7, \T7, \T7
- cmp $16, %r11
- jl .L_get_AAD_rest8\@
-.L_get_AAD_blocks\@:
- vmovdqu (%r10), \T7
- vpshufb SHUF_MASK(%rip), \T7, \T7
- vpxor \T7, \T8, \T8
- \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge .L_get_AAD_blocks\@
- vmovdqu \T8, \T7
- test %r11, %r11
- je .L_get_AAD_done\@
-
- vpxor \T7, \T7, \T7
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-.L_get_AAD_rest8\@:
- cmp $4, %r11
- jle .L_get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, \T7, \T7
- vpxor \T1, \T7, \T7
- jmp .L_get_AAD_rest8\@
-.L_get_AAD_rest4\@:
- test %r11, %r11
- jle .L_get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, \T7, \T7
- vpxor \T1, \T7, \T7
-.L_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and a pair of shuffle masks */
- leaq ALL_F(%rip), %r11
- subq %r12, %r11
- vmovdqu 16(%r11), \T1
- andq $~3, %r11
- vpshufb (%r11), \T7, \T7
- vpand \T1, \T7, \T7
-.L_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), \T7, \T7
- vpxor \T8, \T7, \T7
- \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
-
-.L_get_AAD_done\@:
- vmovdqu \T7, AadHash(arg2)
-.endm
-
-.macro INIT GHASH_MUL PRECOMPUTE
- mov arg6, %r11
- mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
- xor %r11d, %r11d
- mov %r11, InLen(arg2) # ctx_data.in_length = 0
-
- mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
- mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
- mov arg3, %rax
- movdqu (%rax), %xmm0
- movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
-
- vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
- movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
-
- vmovdqu (arg4), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
-
- CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
-
- \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-.endm
-
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
- vpxor \XMMDst, \XMMDst, \XMMDst
-
- cmp $8, \DLEN
- jl .L_read_lt8_\@
- mov (\DPTR), %rax
- vpinsrq $0, %rax, \XMMDst, \XMMDst
- sub $8, \DLEN
- jz .L_done_read_partial_block_\@
- xor %eax, %eax
-.L_read_next_byte_\@:
- shl $8, %rax
- mov 7(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_\@
- vpinsrq $1, %rax, \XMMDst, \XMMDst
- jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
- xor %eax, %eax
-.L_read_next_byte_lt8_\@:
- shl $8, %rax
- mov -1(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_lt8_\@
- vpinsrq $0, %rax, \XMMDst, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
- AAD_HASH ENC_DEC
- mov PBlockLen(arg2), %r13
- test %r13, %r13
- je .L_partial_block_done_\@ # Leave Macro if no partial blocks
- # Read in input data without over reading
- cmp $16, \PLAIN_CYPH_LEN
- jl .L_fewer_than_16_bytes_\@
- vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp .L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
- mov \PLAIN_CYPH_LEN, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
- mov PBlockLen(arg2), %r13
-
-.L_data_read_\@: # Finished reading in data
-
- vmovdqu PBlockEncKey(arg2), %xmm9
- vmovdqu HashKey(arg2), %xmm13
-
- lea SHIFT_MASK(%rip), %r12
-
- # adjust the shuffle mask pointer to be able to shift r13 bytes
- # r16-r13 is the number of bytes in plaintext mod 16)
- add %r13, %r12
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
-
-.if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_1_\@
- sub %r10, %r12
-.L_no_extra_mask_1_\@:
-
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
-
- vpand %xmm1, %xmm3, %xmm3
- vmovdqa SHUF_MASK(%rip), %xmm10
- vpshufb %xmm10, %xmm3, %xmm3
- vpshufb %xmm2, %xmm3, %xmm3
- vpxor %xmm3, \AAD_HASH, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_1_\@
-
- # GHASH computation for the last <16 Byte block
- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax,%eax
-
- mov %rax, PBlockLen(arg2)
- jmp .L_dec_done_\@
-.L_partial_incomplete_1_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_dec_done_\@:
- vmovdqu \AAD_HASH, AadHash(arg2)
-.else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_2_\@
- sub %r10, %r12
-.L_no_extra_mask_2_\@:
-
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9
-
- vmovdqa SHUF_MASK(%rip), %xmm1
- vpshufb %xmm1, %xmm9, %xmm9
- vpshufb %xmm2, %xmm9, %xmm9
- vpxor %xmm9, \AAD_HASH, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_2_\@
-
- # GHASH computation for the last <16 Byte block
- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax,%eax
-
- mov %rax, PBlockLen(arg2)
- jmp .L_encode_done_\@
-.L_partial_incomplete_2_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_encode_done_\@:
- vmovdqu \AAD_HASH, AadHash(arg2)
-
- vmovdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm9 back to output as ciphertext
- vpshufb %xmm10, %xmm9, %xmm9
- vpshufb %xmm2, %xmm9, %xmm9
-.endif
- # output encrypted Bytes
- test %r10, %r10
- jl .L_partial_fill_\@
- mov %r13, %r12
- mov $16, %r13
- # Set r13 to be the number of bytes to write out
- sub %r12, %r13
- jmp .L_count_set_\@
-.L_partial_fill_\@:
- mov \PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
- vmovdqa %xmm9, %xmm0
- vmovq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
-
- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $8, \DATA_OFFSET
- psrldq $8, %xmm0
- vmovq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $1, \DATA_OFFSET
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
-
- vpshufd $0b01001110, \GH, \T2
- vpshufd $0b01001110, \HK, \T3
- vpxor \GH , \T2, \T2 # T2 = (a1+a0)
- vpxor \HK , \T3, \T3 # T3 = (b1+b0)
-
- vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
- vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
- vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
- vpxor \GH, \T2,\T2
- vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
-
- vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
- vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
- vpxor \T3, \GH, \GH
- vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
-
- #first phase of the reduction
- vpslld $31, \GH, \T2 # packed right shifting << 31
- vpslld $30, \GH, \T3 # packed right shifting shift << 30
- vpslld $25, \GH, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \GH, \GH # first phase of the reduction complete
-
- #second phase of the reduction
-
- vpsrld $1,\GH, \T2 # packed left shifting >> 1
- vpsrld $2,\GH, \T3 # packed left shifting >> 2
- vpsrld $7,\GH, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T5, \T2, \T2
- vpxor \T2, \GH, \GH
- vpxor \T1, \GH, \GH # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
-
- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vmovdqa \HK, \T5
-
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_2_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqu \T5, HashKey_3(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_3_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqu \T5, HashKey_4(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_4_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqu \T5, HashKey_5(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_5_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqu \T5, HashKey_6(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_6_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqu \T5, HashKey_7(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_7_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqu \T5, HashKey_8(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_8_k(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
- i = (8-\num_initial_blocks)
- setreg
- vmovdqu AadHash(arg2), reg_i
-
- # start AES for num_initial_blocks blocks
- vmovdqu CurCount(arg2), \CTR
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
- i = (i+1)
- setreg
-.endr
-
- vmovdqa (arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpxor \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = 1
- setreg
-.rep \REP
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenc \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = (j+1)
- setreg
-.endr
-
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenclast \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
- add $16, %r11
-.if \ENC_DEC == DEC
- vmovdqa \T1, reg_i
-.endif
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
- i = (i+1)
- setreg
-.endr
-
-
- i = (8-\num_initial_blocks)
- j = (9-\num_initial_blocks)
- setreg
-
-.rep \num_initial_blocks
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- # XMM8 has the combined result here
-
- vmovdqa \XMM8, TMP1(%rsp)
- vmovdqa \XMM8, \T3
-
- cmp $128, %r13
- jl .L_initial_blocks_done\@ # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM1
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM2
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM3
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM4
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM5
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM6
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM7
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM8
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
- vmovdqa (arg1), \T_key
- vpxor \T_key, \XMM1, \XMM1
- vpxor \T_key, \XMM2, \XMM2
- vpxor \T_key, \XMM3, \XMM3
- vpxor \T_key, \XMM4, \XMM4
- vpxor \T_key, \XMM5, \XMM5
- vpxor \T_key, \XMM6, \XMM6
- vpxor \T_key, \XMM7, \XMM7
- vpxor \T_key, \XMM8, \XMM8
-
- i = 1
- setreg
-.rep \REP # do REP rounds
- vmovdqa 16*i(arg1), \T_key
- vaesenc \T_key, \XMM1, \XMM1
- vaesenc \T_key, \XMM2, \XMM2
- vaesenc \T_key, \XMM3, \XMM3
- vaesenc \T_key, \XMM4, \XMM4
- vaesenc \T_key, \XMM5, \XMM5
- vaesenc \T_key, \XMM6, \XMM6
- vaesenc \T_key, \XMM7, \XMM7
- vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
-.endr
-
- vmovdqa 16*i(arg1), \T_key
- vaesenclast \T_key, \XMM1, \XMM1
- vaesenclast \T_key, \XMM2, \XMM2
- vaesenclast \T_key, \XMM3, \XMM3
- vaesenclast \T_key, \XMM4, \XMM4
- vaesenclast \T_key, \XMM5, \XMM5
- vaesenclast \T_key, \XMM6, \XMM6
- vaesenclast \T_key, \XMM7, \XMM7
- vaesenclast \T_key, \XMM8, \XMM8
-
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM1
- .endif
-
- vmovdqu 16*1(arg4, %r11), \T1
- vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM2
- .endif
-
- vmovdqu 16*2(arg4, %r11), \T1
- vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM3
- .endif
-
- vmovdqu 16*3(arg4, %r11), \T1
- vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM4
- .endif
-
- vmovdqu 16*4(arg4, %r11), \T1
- vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM5
- .endif
-
- vmovdqu 16*5(arg4, %r11), \T1
- vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM6
- .endif
-
- vmovdqu 16*6(arg4, %r11), \T1
- vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM7
- .endif
-
- vmovdqu 16*7(arg4, %r11), \T1
- vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM8
- .endif
-
- add $128, %r11
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-.endm
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
- vmovdqa \XMM1, \T2
- vmovdqa \XMM2, TMP2(%rsp)
- vmovdqa \XMM3, TMP3(%rsp)
- vmovdqa \XMM4, TMP4(%rsp)
- vmovdqa \XMM5, TMP5(%rsp)
- vmovdqa \XMM6, TMP6(%rsp)
- vmovdqa \XMM7, TMP7(%rsp)
- vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONE(%rip), \XMM1, \XMM2
- vpaddd ONE(%rip), \XMM2, \XMM3
- vpaddd ONE(%rip), \XMM3, \XMM4
- vpaddd ONE(%rip), \XMM4, \XMM5
- vpaddd ONE(%rip), \XMM5, \XMM6
- vpaddd ONE(%rip), \XMM6, \XMM7
- vpaddd ONE(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-.else
- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONEf(%rip), \XMM1, \XMM2
- vpaddd ONEf(%rip), \XMM2, \XMM3
- vpaddd ONEf(%rip), \XMM3, \XMM4
- vpaddd ONEf(%rip), \XMM4, \XMM5
- vpaddd ONEf(%rip), \XMM5, \XMM6
- vpaddd ONEf(%rip), \XMM6, \XMM7
- vpaddd ONEf(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-.endif
-
-
- #######################################################################
-
- vmovdqu (arg1), \T1
- vpxor \T1, \XMM1, \XMM1
- vpxor \T1, \XMM2, \XMM2
- vpxor \T1, \XMM3, \XMM3
- vpxor \T1, \XMM4, \XMM4
- vpxor \T1, \XMM5, \XMM5
- vpxor \T1, \XMM6, \XMM6
- vpxor \T1, \XMM7, \XMM7
- vpxor \T1, \XMM8, \XMM8
-
- #######################################################################
-
-
-
-
-
- vmovdqu 16*1(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqu 16*2(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- #######################################################################
-
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
-
- vpshufd $0b01001110, \T2, \T6
- vpxor \T2, \T6, \T6
-
- vmovdqu HashKey_8_k(arg2), \T5
- vpclmulqdq $0x00, \T5, \T6, \T6
-
- vmovdqu 16*3(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP2(%rsp), \T1
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_7_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*4(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- #######################################################################
-
- vmovdqa TMP3(%rsp), \T1
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_6_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*5(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP4(%rsp), \T1
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_5_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*6(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- vmovdqa TMP5(%rsp), \T1
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_4_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*7(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP6(%rsp), \T1
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_3_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
-
- vmovdqu 16*8(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP7(%rsp), \T1
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_2_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- #######################################################################
-
- vmovdqu 16*9(arg1), \T5
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqa TMP8(%rsp), \T1
- vmovdqu HashKey(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vpxor \T4, \T6, \T6
- vpxor \T7, \T6, \T6
-
- vmovdqu 16*10(arg1), \T5
-
- i = 11
- setreg
-.rep (\REP-9)
-
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqu 16*i(arg1), \T5
- i = i + 1
- setreg
-.endr
-
- i = 0
- j = 1
- setreg
-.rep 8
- vpxor 16*i(arg4, %r11), \T5, \T2
- .if \ENC_DEC == ENC
- vaesenclast \T2, reg_j, reg_j
- .else
- vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg4, %r11), reg_j
- vmovdqu \T3, 16*i(arg3, %r11)
- .endif
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- #######################################################################
-
-
- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
- vpxor \T3, \T7, \T7
- vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
-
-
-
- #######################################################################
- #first phase of the reduction
- #######################################################################
- vpslld $31, \T7, \T2 # packed right shifting << 31
- vpslld $30, \T7, \T3 # packed right shifting shift << 30
- vpslld $25, \T7, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
- .if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
- .endif
-
- #######################################################################
- #second phase of the reduction
- vpsrld $1, \T7, \T2 # packed left shifting >> 1
- vpsrld $2, \T7, \T3 # packed left shifting >> 2
- vpsrld $7, \T7, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T1, \T2, \T2
- vpxor \T2, \T7, \T7
- vpxor \T7, \T6, \T6 # the result is in T6
- #######################################################################
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-
- vpxor \T6, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
- ## Karatsuba Method
-
-
- vpshufd $0b01001110, \XMM1, \T2
- vpxor \XMM1, \T2, \T2
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM1, \T6
- vpclmulqdq $0x00, \T5, \XMM1, \T7
-
- vmovdqu HashKey_8_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM2, \T2
- vpxor \XMM2, \T2, \T2
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM2, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM2, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_7_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM3, \T2
- vpxor \XMM3, \T2, \T2
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM3, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM3, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_6_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM4, \T2
- vpxor \XMM4, \T2, \T2
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM4, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM4, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_5_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM5, \T2
- vpxor \XMM5, \T2, \T2
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM5, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM5, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_4_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM6, \T2
- vpxor \XMM6, \T2, \T2
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM6, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM6, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_3_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM7, \T2
- vpxor \XMM7, \T2, \T2
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM7, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM7, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_2_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM8, \T2
- vpxor \XMM8, \T2, \T2
- vmovdqu HashKey(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM8, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM8, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
- vpxor \T6, \XMM1, \XMM1
- vpxor \T7, \XMM1, \T2
-
-
-
-
- vpslldq $8, \T2, \T4
- vpsrldq $8, \T2, \T2
-
- vpxor \T4, \T7, \T7
- vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
- # the accumulated carry-less multiplications
-
- #######################################################################
- #first phase of the reduction
- vpslld $31, \T7, \T2 # packed right shifting << 31
- vpslld $30, \T7, \T3 # packed right shifting shift << 30
- vpslld $25, \T7, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
-
-
- #second phase of the reduction
- vpsrld $1, \T7, \T2 # packed left shifting >> 1
- vpsrld $2, \T7, \T3 # packed left shifting >> 2
- vpsrld $7, \T7, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T1, \T2, \T2
- vpxor \T2, \T7, \T7
- vpxor \T7, \T6, \T6 # the result is in T6
-
-.endm
-
-#############################################################
-#void aesni_gcm_precomp_avx_gen2
-# (gcm_data *my_ctx_data,
-# gcm_context_data *data,
-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen2)
- FUNC_SAVE
- INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_enc_update_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
-# const u8 *in, /* Plaintext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
- FUNC_SAVE
- mov keysize, %eax
- cmp $32, %eax
- je key_256_enc_update
- cmp $16, %eax
- je key_128_enc_update
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
- FUNC_RESTORE
- RET
-key_128_enc_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
- FUNC_RESTORE
- RET
-key_256_enc_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_dec_update_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
-# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_dec_update
- cmp $16, %eax
- je key_128_dec_update
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
- FUNC_RESTORE
- RET
-key_128_dec_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
- FUNC_RESTORE
- RET
-key_256_dec_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_finalize_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_finalize
- cmp $16, %eax
- je key_128_finalize
- # must be 192
- GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
- FUNC_RESTORE
- RET
-key_128_finalize:
- GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
- FUNC_RESTORE
- RET
-key_256_finalize:
- GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
-
- vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
- vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
- vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
- vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
- vpxor \T3, \GH, \GH
-
-
- vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
- vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
-
- vpxor \T3, \T1, \T1
- vpxor \T2, \GH, \GH
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \GH, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
-
- vpxor \T2, \GH, \GH # first phase of the reduction complete
- #######################################################################
- #second phase of the reduction
- vpclmulqdq $0x00, \GH, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \GH, \T3, \GH
- vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \GH, \GH # second phase of the reduction complete
- #######################################################################
- vpxor \T1, \GH, \GH # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
-
- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vmovdqa \HK, \T5
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqu \T5, HashKey_3(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqu \T5, HashKey_4(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqu \T5, HashKey_5(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqu \T5, HashKey_6(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqu \T5, HashKey_7(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqu \T5, HashKey_8(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
- i = (8-\num_initial_blocks)
- setreg
- vmovdqu AadHash(arg2), reg_i
-
- # start AES for num_initial_blocks blocks
- vmovdqu CurCount(arg2), \CTR
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
- i = (i+1)
- setreg
-.endr
-
- vmovdqa (arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpxor \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = 1
- setreg
-.rep \REP
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenc \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = (j+1)
- setreg
-.endr
-
-
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenclast \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
- # num_initial_blocks blocks
- add $16, %r11
-.if \ENC_DEC == DEC
- vmovdqa \T1, reg_i
-.endif
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
- i = (i+1)
- setreg
-.endr
-
-
- i = (8-\num_initial_blocks)
- j = (9-\num_initial_blocks)
- setreg
-
-.rep \num_initial_blocks
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- # XMM8 has the combined result here
-
- vmovdqa \XMM8, TMP1(%rsp)
- vmovdqa \XMM8, \T3
-
- cmp $128, %r13
- jl .L_initial_blocks_done\@ # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM1
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM2
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM3
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM4
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM5
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM6
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM7
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM8
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
- vmovdqa (arg1), \T_key
- vpxor \T_key, \XMM1, \XMM1
- vpxor \T_key, \XMM2, \XMM2
- vpxor \T_key, \XMM3, \XMM3
- vpxor \T_key, \XMM4, \XMM4
- vpxor \T_key, \XMM5, \XMM5
- vpxor \T_key, \XMM6, \XMM6
- vpxor \T_key, \XMM7, \XMM7
- vpxor \T_key, \XMM8, \XMM8
-
- i = 1
- setreg
-.rep \REP # do REP rounds
- vmovdqa 16*i(arg1), \T_key
- vaesenc \T_key, \XMM1, \XMM1
- vaesenc \T_key, \XMM2, \XMM2
- vaesenc \T_key, \XMM3, \XMM3
- vaesenc \T_key, \XMM4, \XMM4
- vaesenc \T_key, \XMM5, \XMM5
- vaesenc \T_key, \XMM6, \XMM6
- vaesenc \T_key, \XMM7, \XMM7
- vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
-.endr
-
-
- vmovdqa 16*i(arg1), \T_key
- vaesenclast \T_key, \XMM1, \XMM1
- vaesenclast \T_key, \XMM2, \XMM2
- vaesenclast \T_key, \XMM3, \XMM3
- vaesenclast \T_key, \XMM4, \XMM4
- vaesenclast \T_key, \XMM5, \XMM5
- vaesenclast \T_key, \XMM6, \XMM6
- vaesenclast \T_key, \XMM7, \XMM7
- vaesenclast \T_key, \XMM8, \XMM8
-
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM1
- .endif
-
- vmovdqu 16*1(arg4, %r11), \T1
- vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM2
- .endif
-
- vmovdqu 16*2(arg4, %r11), \T1
- vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM3
- .endif
-
- vmovdqu 16*3(arg4, %r11), \T1
- vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM4
- .endif
-
- vmovdqu 16*4(arg4, %r11), \T1
- vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM5
- .endif
-
- vmovdqu 16*5(arg4, %r11), \T1
- vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM6
- .endif
-
- vmovdqu 16*6(arg4, %r11), \T1
- vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM7
- .endif
-
- vmovdqu 16*7(arg4, %r11), \T1
- vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM8
- .endif
-
- add $128, %r11
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
- # the corresponding ciphertext
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-
-.endm
-
-
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
- vmovdqa \XMM1, \T2
- vmovdqa \XMM2, TMP2(%rsp)
- vmovdqa \XMM3, TMP3(%rsp)
- vmovdqa \XMM4, TMP4(%rsp)
- vmovdqa \XMM5, TMP5(%rsp)
- vmovdqa \XMM6, TMP6(%rsp)
- vmovdqa \XMM7, TMP7(%rsp)
- vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONE(%rip), \XMM1, \XMM2
- vpaddd ONE(%rip), \XMM2, \XMM3
- vpaddd ONE(%rip), \XMM3, \XMM4
- vpaddd ONE(%rip), \XMM4, \XMM5
- vpaddd ONE(%rip), \XMM5, \XMM6
- vpaddd ONE(%rip), \XMM6, \XMM7
- vpaddd ONE(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-.else
- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONEf(%rip), \XMM1, \XMM2
- vpaddd ONEf(%rip), \XMM2, \XMM3
- vpaddd ONEf(%rip), \XMM3, \XMM4
- vpaddd ONEf(%rip), \XMM4, \XMM5
- vpaddd ONEf(%rip), \XMM5, \XMM6
- vpaddd ONEf(%rip), \XMM6, \XMM7
- vpaddd ONEf(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-.endif
-
-
- #######################################################################
-
- vmovdqu (arg1), \T1
- vpxor \T1, \XMM1, \XMM1
- vpxor \T1, \XMM2, \XMM2
- vpxor \T1, \XMM3, \XMM3
- vpxor \T1, \XMM4, \XMM4
- vpxor \T1, \XMM5, \XMM5
- vpxor \T1, \XMM6, \XMM6
- vpxor \T1, \XMM7, \XMM7
- vpxor \T1, \XMM8, \XMM8
-
- #######################################################################
-
-
-
-
-
- vmovdqu 16*1(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqu 16*2(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- #######################################################################
-
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
- vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
- vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
- vpxor \T5, \T6, \T6
-
- vmovdqu 16*3(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP2(%rsp), \T1
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*4(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- #######################################################################
-
- vmovdqa TMP3(%rsp), \T1
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*5(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP4(%rsp), \T1
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*6(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- vmovdqa TMP5(%rsp), \T1
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*7(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP6(%rsp), \T1
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*8(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP7(%rsp), \T1
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
-
- #######################################################################
-
- vmovdqu 16*9(arg1), \T5
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqa TMP8(%rsp), \T1
- vmovdqu HashKey(arg2), \T5
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T1
-
-
- vmovdqu 16*10(arg1), \T5
-
- i = 11
- setreg
-.rep (\REP-9)
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqu 16*i(arg1), \T5
- i = i + 1
- setreg
-.endr
-
- i = 0
- j = 1
- setreg
-.rep 8
- vpxor 16*i(arg4, %r11), \T5, \T2
- .if \ENC_DEC == ENC
- vaesenclast \T2, reg_j, reg_j
- .else
- vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg4, %r11), reg_j
- vmovdqu \T3, 16*i(arg3, %r11)
- .endif
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- #######################################################################
-
-
- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
- vpxor \T3, \T7, \T7
- vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
-
-
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \T7, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
-
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
- .if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
- .endif
-
- #######################################################################
- #second phase of the reduction
- vpclmulqdq $0x00, \T7, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \T7, \T3, \T4
- vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \T4, \T4 # second phase of the reduction complete
- #######################################################################
- vpxor \T4, \T1, \T1 # the result is in T1
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-
- vpxor \T1, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
- ## Karatsuba Method
-
- vmovdqu HashKey_8(arg2), \T5
-
- vpshufd $0b01001110, \XMM1, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM1, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM1, \T6
- vpclmulqdq $0x00, \T5, \XMM1, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \XMM1
-
- ######################
-
- vmovdqu HashKey_7(arg2), \T5
- vpshufd $0b01001110, \XMM2, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM2, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM2, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM2, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_6(arg2), \T5
- vpshufd $0b01001110, \XMM3, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM3, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM3, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM3, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_5(arg2), \T5
- vpshufd $0b01001110, \XMM4, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM4, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM4, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM4, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_4(arg2), \T5
- vpshufd $0b01001110, \XMM5, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM5, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM5, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM5, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_3(arg2), \T5
- vpshufd $0b01001110, \XMM6, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM6, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM6, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM6, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_2(arg2), \T5
- vpshufd $0b01001110, \XMM7, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM7, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM7, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM7, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey(arg2), \T5
- vpshufd $0b01001110, \XMM8, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM8, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM8, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM8, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
- vpxor \T6, \XMM1, \XMM1
- vpxor \T7, \XMM1, \T2
-
-
-
-
- vpslldq $8, \T2, \T4
- vpsrldq $8, \T2, \T2
-
- vpxor \T4, \T7, \T7
- vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
- # accumulated carry-less multiplications
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \T7, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
-
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
-
-
- #second phase of the reduction
- vpclmulqdq $0x00, \T7, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \T7, \T3, \T4
- vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \T4, \T4 # second phase of the reduction complete
- #######################################################################
- vpxor \T4, \T6, \T6 # the result is in T6
-.endm
-
-
-
-#############################################################
-#void aesni_gcm_init_avx_gen4
-# (gcm_data *my_ctx_data,
-# gcm_context_data *data,
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen4)
- FUNC_SAVE
- INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_enc_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
-# const u8 *in, /* Plaintext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_enc_update4
- cmp $16, %eax
- je key_128_enc_update4
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
- FUNC_RESTORE
- RET
-key_128_enc_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
- FUNC_RESTORE
- RET
-key_256_enc_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_dec_update_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
-# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_dec_update4
- cmp $16, %eax
- je key_128_dec_update4
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
- FUNC_RESTORE
- RET
-key_128_dec_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
- FUNC_RESTORE
- RET
-key_256_dec_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_finalize_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_finalize4
- cmp $16, %eax
- je key_128_finalize4
- # must be 192
- GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
- FUNC_RESTORE
- RET
-key_128_finalize4:
- GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
- FUNC_RESTORE
- RET
-key_256_finalize4:
- GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ef031655b2d3..cd37de5ec404 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Support for Intel AES-NI instructions. This file contains glue
- * code, the real AES implementation is in intel-aes_asm.S.
+ * Support for AES-NI and VAES instructions. This file contains glue code.
+ * The real AES implementations are in aesni-intel_asm.S and other .S files.
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
@@ -13,6 +13,8 @@
* Tadeusz Struk (tadeusz.struk@intel.com)
* Aidan O'Mahony (aidan.o.mahony@intel.com)
* Copyright (c) 2010, Intel Corporation.
+ *
+ * Copyright 2024 Google LLC
*/
#include <linux/hardirq.h>
@@ -44,41 +46,11 @@
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
-/* This data is stored at the end of the crypto_tfm struct.
- * It's a type of per "session" data storage location.
- * This needs to be 16 byte aligned.
- */
-struct aesni_rfc4106_gcm_ctx {
- u8 hash_subkey[16] AESNI_ALIGN_ATTR;
- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
- u8 nonce[4];
-};
-
-struct generic_gcmaes_ctx {
- u8 hash_subkey[16] AESNI_ALIGN_ATTR;
- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
-};
-
struct aesni_xts_ctx {
struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};
-#define GCM_BLOCK_LEN 16
-
-struct gcm_context_data {
- /* init, update and finalize context data */
- u8 aad_hash[GCM_BLOCK_LEN];
- u64 aad_length;
- u64 in_length;
- u8 partial_block_enc_key[GCM_BLOCK_LEN];
- u8 orig_IV[GCM_BLOCK_LEN];
- u8 current_counter[GCM_BLOCK_LEN];
- u64 partial_block_len;
- u64 unused;
- u8 hash_keys[GCM_BLOCK_LEN * 16];
-};
-
static inline void *aes_align_addr(void *addr)
{
if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
@@ -103,9 +75,6 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-#define AVX_GEN2_OPTSIZE 640
-#define AVX_GEN4_OPTSIZE 4096
-
asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -118,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
-/* Scatter / Gather routines, with args similar to above */
-asmlinkage void aesni_gcm_init(void *ctx,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len);
-asmlinkage void aesni_gcm_enc_update(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
@@ -154,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen2()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey,
- const u8 *aad,
- unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen4()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey,
- const u8 *aad,
- unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
-
-static inline struct
-aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
-{
- return aes_align_addr(crypto_aead_ctx(tfm));
-}
-
-static inline struct
-generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
-{
- return aes_align_addr(crypto_aead_ctx(tfm));
-}
#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
@@ -588,280 +479,6 @@ static int xctr_crypt(struct skcipher_request *req)
}
return err;
}
-
-static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key,
- u8 hash_subkey[AES_BLOCK_SIZE])
-{
- static const u8 zeroes[AES_BLOCK_SIZE];
-
- aes_encrypt(aes_key, hash_subkey, zeroes);
- return 0;
-}
-
-static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
- unsigned int key_len)
-{
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
-
- if (key_len < 4)
- return -EINVAL;
-
- /*Account for 4 byte nonce at the end.*/
- key_len -= 4;
-
- memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
-
- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
- ctx->hash_subkey);
-}
-
-/* This is the Integrity Check Value (aka the authentication tag) length and can
- * be 8, 12 or 16 bytes long. */
-static int common_rfc4106_set_authsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- switch (authsize) {
- case 8:
- case 12:
- case 16:
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- switch (authsize) {
- case 4:
- case 8:
- case 12:
- case 13:
- case 14:
- case 15:
- case 16:
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
- unsigned int assoclen, u8 *hash_subkey,
- u8 *iv, void *aes_ctx, u8 *auth_tag,
- unsigned long auth_tag_len)
-{
- u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
- struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
- unsigned long left = req->cryptlen;
- struct scatter_walk assoc_sg_walk;
- struct skcipher_walk walk;
- bool do_avx, do_avx2;
- u8 *assocmem = NULL;
- u8 *assoc;
- int err;
-
- if (!enc)
- left -= auth_tag_len;
-
- do_avx = (left >= AVX_GEN2_OPTSIZE);
- do_avx2 = (left >= AVX_GEN4_OPTSIZE);
-
- /* Linearize assoc, if not already linear */
- if (req->src->length >= assoclen && req->src->length) {
- scatterwalk_start(&assoc_sg_walk, req->src);
- assoc = scatterwalk_map(&assoc_sg_walk);
- } else {
- gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
- GFP_KERNEL : GFP_ATOMIC;
-
- /* assoc can be any length, so must be on heap */
- assocmem = kmalloc(assoclen, flags);
- if (unlikely(!assocmem))
- return -ENOMEM;
- assoc = assocmem;
-
- scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
- }
-
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2)
- aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc,
- assoclen);
- else if (static_branch_likely(&gcm_use_avx) && do_avx)
- aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc,
- assoclen);
- else
- aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
- kernel_fpu_end();
-
- if (!assocmem)
- scatterwalk_unmap(assoc);
- else
- kfree(assocmem);
-
- err = enc ? skcipher_walk_aead_encrypt(&walk, req, false)
- : skcipher_walk_aead_decrypt(&walk, req, false);
-
- while (walk.nbytes > 0) {
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2) {
- if (enc)
- aesni_gcm_enc_update_avx_gen4(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- else
- aesni_gcm_dec_update_avx_gen4(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- } else if (static_branch_likely(&gcm_use_avx) && do_avx) {
- if (enc)
- aesni_gcm_enc_update_avx_gen2(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- else
- aesni_gcm_dec_update_avx_gen2(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- } else if (enc) {
- aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr,
- walk.src.virt.addr, walk.nbytes);
- } else {
- aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr,
- walk.src.virt.addr, walk.nbytes);
- }
- kernel_fpu_end();
-
- err = skcipher_walk_done(&walk, 0);
- }
-
- if (err)
- return err;
-
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2)
- aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag,
- auth_tag_len);
- else if (static_branch_likely(&gcm_use_avx) && do_avx)
- aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag,
- auth_tag_len);
- else
- aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
- kernel_fpu_end();
-
- return 0;
-}
-
-static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
- u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 auth_tag[16];
- int err;
-
- err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx,
- auth_tag, auth_tag_len);
- if (err)
- return err;
-
- scatterwalk_map_and_copy(auth_tag, req->dst,
- req->assoclen + req->cryptlen,
- auth_tag_len, 1);
- return 0;
-}
-
-static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
- u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 auth_tag_msg[16];
- u8 auth_tag[16];
- int err;
-
- err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx,
- auth_tag, auth_tag_len);
- if (err)
- return err;
-
- /* Copy out original auth_tag */
- scatterwalk_map_and_copy(auth_tag_msg, req->src,
- req->assoclen + req->cryptlen - auth_tag_len,
- auth_tag_len, 0);
-
- /* Compare generated tag with passed in tag. */
- if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) {
- memzero_explicit(auth_tag, sizeof(auth_tag));
- return -EBADMSG;
- }
- return 0;
-}
-
-static int helper_rfc4106_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- unsigned int i;
- __be32 counter = cpu_to_be32(1);
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length equal */
- /* to 16 or 20 bytes */
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
- aes_ctx);
-}
-
-static int helper_rfc4106_decrypt(struct aead_request *req)
-{
- __be32 counter = cpu_to_be32(1);
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- unsigned int i;
-
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length */
- /* equal to 16 or 20 bytes */
-
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
- aes_ctx);
-}
#endif
static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
@@ -1216,11 +833,717 @@ DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
#endif
+/* The common part of the x86_64 AES-GCM key struct */
+struct aes_gcm_key {
+ /* Expanded AES key and the AES key length in bytes */
+ struct crypto_aes_ctx aes_key;
+
+ /* RFC4106 nonce (used only by the rfc4106 algorithms) */
+ u32 rfc4106_nonce;
+};
+
+/* Key struct used by the AES-NI implementations of AES-GCM */
+struct aes_gcm_key_aesni {
+ /*
+ * Common part of the key. The assembly code requires 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 16-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^8 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_powers[8][2] __aligned(16);
+
+ /*
+ * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
+ * together. It's used for Karatsuba multiplication. 16-byte alignment
+ * is required by the assembly code.
+ */
+ u64 h_powers_xored[8] __aligned(16);
+
+ /*
+ * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_times_x64[2] __aligned(16);
+};
+#define AES_GCM_KEY_AESNI(key) \
+ container_of((key), struct aes_gcm_key_aesni, base)
+#define AES_GCM_KEY_AESNI_SIZE \
+ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
+struct aes_gcm_key_avx10 {
+ /*
+ * Common part of the key. The assembly code prefers 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 64-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^16 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. This
+ * array is aligned to a 64-byte boundary to make it naturally aligned
+ * for 512-bit loads, which can improve performance. (The assembly code
+ * doesn't *need* the alignment; this is just an optimization.)
+ */
+ u64 h_powers[16][2] __aligned(64);
+
+ /* Three padding blocks required by the assembly code */
+ u64 padding[3][2];
+};
+#define AES_GCM_KEY_AVX10(key) \
+ container_of((key), struct aes_gcm_key_avx10, base)
+#define AES_GCM_KEY_AVX10_SIZE \
+ (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+
+/*
+ * These flags are passed to the AES-GCM helper functions to specify the
+ * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
+ * decryption, and which assembly functions should be called. Assembly
+ * functions are selected using flags instead of function pointers to avoid
+ * indirect calls (which are very expensive on x86) regardless of inlining.
+ */
+#define FLAG_RFC4106 BIT(0)
+#define FLAG_ENC BIT(1)
+#define FLAG_AVX BIT(2)
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+# define FLAG_AVX10_256 BIT(3)
+# define FLAG_AVX10_512 BIT(4)
+#else
+ /*
+ * This should cause all calls to the AVX10 assembly functions to be
+ * optimized out, avoiding the need to ifdef each call individually.
+ */
+# define FLAG_AVX10_256 0
+# define FLAG_AVX10_512 0
+#endif
+
+static inline struct aes_gcm_key *
+aes_gcm_key_get(struct crypto_aead *tfm, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+ else
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
+}
+
+asmlinkage void
+aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
+
+static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
+{
+ /*
+ * To make things a bit easier on the assembly side, the AVX10
+ * implementations use the same key format. Therefore, a single
+ * function using 256-bit vectors would suffice here. However, it's
+ * straightforward to provide a 512-bit one because of how the assembly
+ * code is structured, and it works nicely because the total size of the
+ * key powers is a multiple of 512 bits. So we take advantage of that.
+ *
+ * A similar situation applies to the AES-NI implementations.
+ */
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
+ else if (flags & FLAG_AVX)
+ aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
+ else
+ aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
+}
+
+asmlinkage void
+aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+
+static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ const u8 *aad, int aadlen, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
+ aad, aadlen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+ else
+ aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+}
+
+asmlinkage void
+aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+asmlinkage void
+aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_update(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen, int flags)
+{
+ if (flags & FLAG_ENC) {
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
+ ghash_acc, src, dst, datalen);
+ } else {
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ }
+}
+
+asmlinkage void
+aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_enc_final(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else
+ aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+}
+
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline bool __must_check
+aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
+ u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
+ u8 tag[16], int taglen, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else if (flags & FLAG_AVX)
+ return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else
+ return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+}
+
+/*
+ * This is the Integrity Check Value (aka the authentication tag) length and can
+ * be 8, 12 or 16 bytes long.
+ */
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * This is the setkey function for the x86_64 implementations of AES-GCM. It
+ * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
+ * powers of the hash key.
+ *
+ * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
+ * For that reason, this function includes a portable C implementation of the
+ * needed logic. However, the portable C implementation is very slow, taking
+ * about the same time as encrypting 37 KB of data. To be ready for users that
+ * may set a key even somewhat frequently, we therefore also include a SIMD
+ * assembly implementation, expanding the AES key using AES-NI and precomputing
+ * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
+ */
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
+ unsigned int keylen, int flags)
+{
+ struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ int err;
+
+ if (flags & FLAG_RFC4106) {
+ if (keylen < 4)
+ return -EINVAL;
+ keylen -= 4;
+ key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
+ }
+
+ /* The assembly code assumes the following offsets. */
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+
+ if (likely(crypto_simd_usable())) {
+ err = aes_check_keylen(keylen);
+ if (err)
+ return err;
+ kernel_fpu_begin();
+ aesni_set_key(&key->aes_key, raw_key, keylen);
+ aes_gcm_precompute(key, flags);
+ kernel_fpu_end();
+ } else {
+ static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
+ [0] = 0xc2, [15] = 1
+ };
+ static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = {
+ [7] = 1,
+ };
+ be128 h1 = {};
+ be128 h;
+ int i;
+
+ err = aes_expandkey(&key->aes_key, raw_key, keylen);
+ if (err)
+ return err;
+
+ /* Encrypt the all-zeroes block to get the hash key H^1 */
+ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);
+
+ /* Compute H^1 * x^-1 */
+ h = h1;
+ gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
+
+ /* Compute the needed key powers */
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
+ struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ gf128mul_lle(&h, &h1);
+ }
+ memset(k->padding, 0, sizeof(k->padding));
+ } else {
+ struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ k->h_powers_xored[i] = k->h_powers[i][0] ^
+ k->h_powers[i][1];
+ gf128mul_lle(&h, &h1);
+ }
+ gf128mul_lle(&h1, (const be128 *)x_to_the_63);
+ k->h_times_x64[0] = be64_to_cpu(h1.b);
+ k->h_times_x64[1] = be64_to_cpu(h1.a);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Initialize @ghash_acc, then pass all @assoclen bytes of associated data
+ * (a.k.a. additional authenticated data) from @sg_src through the GHASH update
+ * assembly function. kernel_fpu_begin() must have already been called.
+ */
+static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ struct scatterlist *sg_src, unsigned int assoclen,
+ int flags)
+{
+ struct scatter_walk walk;
+ /*
+ * The assembly function requires that the length of any non-last
+ * segment of associated data be a multiple of 16 bytes, so this
+ * function does the buffering needed to achieve that.
+ */
+ unsigned int pos = 0;
+ u8 buf[16];
+
+ memset(ghash_acc, 0, 16);
+ scatterwalk_start(&walk, sg_src);
+
+ while (assoclen) {
+ unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen);
+ void *mapped = scatterwalk_map(&walk);
+ const void *src = mapped;
+ unsigned int len;
+
+ assoclen -= len_this_page;
+ scatterwalk_advance(&walk, len_this_page);
+ if (unlikely(pos)) {
+ len = min(len_this_page, 16 - pos);
+ memcpy(&buf[pos], src, len);
+ pos += len;
+ src += len;
+ len_this_page -= len;
+ if (pos < 16)
+ goto next;
+ aes_gcm_aad_update(key, ghash_acc, buf, 16, flags);
+ pos = 0;
+ }
+ len = len_this_page;
+ if (unlikely(assoclen)) /* Not the last segment yet? */
+ len = round_down(len, 16);
+ aes_gcm_aad_update(key, ghash_acc, src, len, flags);
+ src += len;
+ len_this_page -= len;
+ if (unlikely(len_this_page)) {
+ memcpy(buf, src, len_this_page);
+ pos = len_this_page;
+ }
+next:
+ scatterwalk_unmap(mapped);
+ scatterwalk_pagedone(&walk, 0, assoclen);
+ if (need_resched()) {
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ }
+ }
+ if (unlikely(pos))
+ aes_gcm_aad_update(key, ghash_acc, buf, pos, flags);
+}
+
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline int
+gcm_crypt(struct aead_request *req, int flags)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ unsigned int assoclen = req->assoclen;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ u8 ghash_acc[16]; /* GHASH accumulator */
+ u32 le_ctr[4]; /* Counter in little-endian format */
+ int taglen;
+ int err;
+
+ /* Initialize the counter and determine the associated data length. */
+ le_ctr[0] = 2;
+ if (flags & FLAG_RFC4106) {
+ if (unlikely(assoclen != 16 && assoclen != 20))
+ return -EINVAL;
+ assoclen -= 8;
+ le_ctr[1] = get_unaligned_be32(req->iv + 4);
+ le_ctr[2] = get_unaligned_be32(req->iv + 0);
+ le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */
+ } else {
+ le_ctr[1] = get_unaligned_be32(req->iv + 8);
+ le_ctr[2] = get_unaligned_be32(req->iv + 4);
+ le_ctr[3] = get_unaligned_be32(req->iv + 0);
+ }
+
+ /* Begin walking through the plaintext or ciphertext. */
+ if (flags & FLAG_ENC)
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ else
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+
+ /*
+ * Since the AES-GCM assembly code requires that at least three assembly
+ * functions be called to process any message (this is needed to support
+ * incremental updates cleanly), to reduce overhead we try to do all
+ * three calls in the same kernel FPU section if possible. We close the
+ * section and start a new one if there are multiple data segments or if
+ * rescheduling is needed while processing the associated data.
+ */
+ kernel_fpu_begin();
+
+ /* Pass the associated data through GHASH. */
+ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);
+
+ /* En/decrypt the data and pass the ciphertext through GHASH. */
+ while ((nbytes = walk.nbytes) != 0) {
+ if (unlikely(nbytes < walk.total)) {
+ /*
+ * Non-last segment. In this case, the assembly
+ * function requires that the length be a multiple of 16
+ * (AES_BLOCK_SIZE) bytes. The needed buffering of up
+ * to 16 bytes is handled by the skcipher_walk. Here we
+ * just need to round down to a multiple of 16.
+ */
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ aes_gcm_update(key, le_ctr, ghash_acc,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ nbytes, flags);
+ le_ctr[0] += nbytes / AES_BLOCK_SIZE;
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ kernel_fpu_begin();
+ } else {
+ /* Last segment: process all remaining data. */
+ aes_gcm_update(key, le_ctr, ghash_acc,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ nbytes, flags);
+ err = skcipher_walk_done(&walk, 0);
+ /*
+ * The low word of the counter isn't used by the
+ * finalize, so there's no need to increment it here.
+ */
+ }
+ }
+ if (err)
+ goto out;
+
+ /* Finalize */
+ taglen = crypto_aead_authsize(tfm);
+ if (flags & FLAG_ENC) {
+ /* Finish computing the auth tag. */
+ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
+ req->cryptlen, flags);
+
+ /* Store the computed auth tag in the dst scatterlist. */
+ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
+ req->cryptlen, taglen, 1);
+ } else {
+ unsigned int datalen = req->cryptlen - taglen;
+ u8 tag[16];
+
+ /* Get the transmitted auth tag from the src scatterlist. */
+ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
+ taglen, 0);
+ /*
+ * Finish computing the auth tag and compare it to the
+ * transmitted one. The assembly function does the actual tag
+ * comparison. Here, just check the boolean result.
+ */
+ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
+ datalen, tag, taglen, flags))
+ err = -EBADMSG;
+ }
+out:
+ kernel_fpu_end();
+ return err;
+}
+
+#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \
+ ctxsize, priority) \
+ \
+static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags)); \
+} \
+ \
+static int gcm_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_ENC); \
+} \
+ \
+static int gcm_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags)); \
+} \
+ \
+static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \
+} \
+ \
+static int rfc4106_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \
+} \
+ \
+static int rfc4106_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106); \
+} \
+ \
+static struct aead_alg aes_gcm_algs_##suffix[] = { { \
+ .setkey = gcm_setkey_##suffix, \
+ .setauthsize = generic_gcmaes_set_authsize, \
+ .encrypt = gcm_encrypt_##suffix, \
+ .decrypt = gcm_decrypt_##suffix, \
+ .ivsize = GCM_AES_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "__gcm(aes)", \
+ .cra_driver_name = "__" generic_driver_name, \
+ .cra_priority = (priority), \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+}, { \
+ .setkey = rfc4106_setkey_##suffix, \
+ .setauthsize = common_rfc4106_set_authsize, \
+ .encrypt = rfc4106_encrypt_##suffix, \
+ .decrypt = rfc4106_decrypt_##suffix, \
+ .ivsize = GCM_RFC4106_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "__rfc4106(gcm(aes))", \
+ .cra_driver_name = "__" rfc_driver_name, \
+ .cra_priority = (priority), \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+} }; \
+ \
+static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \
+
+/* aes_gcm_algs_aesni */
+DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
+ "generic-gcm-aesni", "rfc4106-gcm-aesni",
+ AES_GCM_KEY_AESNI_SIZE, 400);
+
+/* aes_gcm_algs_aesni_avx */
+DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
+ "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
+ AES_GCM_KEY_AESNI_SIZE, 500);
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+/* aes_gcm_algs_vaes_avx10_256 */
+DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
+ "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
+ AES_GCM_KEY_AVX10_SIZE, 700);
+
+/* aes_gcm_algs_vaes_avx10_512 */
+DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
+ "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
+ AES_GCM_KEY_AVX10_SIZE, 800);
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+
/*
* This is a list of CPU models that are known to suffer from downclocking when
- * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS
- * implementation with zmm registers won't be used by default. An
- * implementation with ymm registers (256-bit vectors) will be used instead.
+ * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode
+ * implementations with zmm registers won't be used by default. Implementations
+ * with ymm registers (256-bit vectors) will be used by default instead.
*/
static const struct x86_cpu_id zmm_exclusion_list[] = {
X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
@@ -1236,7 +1559,7 @@ static const struct x86_cpu_id zmm_exclusion_list[] = {
{},
};
-static int __init register_xts_algs(void)
+static int __init register_avx_algs(void)
{
int err;
@@ -1246,6 +1569,11 @@ static int __init register_xts_algs(void)
&aes_xts_simdalg_aesni_avx);
if (err)
return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+ aes_gcm_simdalgs_aesni_avx);
+ if (err)
+ return err;
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
if (!boot_cpu_has(X86_FEATURE_AVX2) ||
!boot_cpu_has(X86_FEATURE_VAES) ||
@@ -1269,23 +1597,42 @@ static int __init register_xts_algs(void)
&aes_xts_simdalg_vaes_avx10_256);
if (err)
return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+ aes_gcm_simdalgs_vaes_avx10_256);
+ if (err)
+ return err;
+
+ if (x86_match_cpu(zmm_exclusion_list)) {
+ int i;
- if (x86_match_cpu(zmm_exclusion_list))
aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
+ aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+ }
err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
&aes_xts_simdalg_vaes_avx10_512);
if (err)
return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+ aes_gcm_simdalgs_vaes_avx10_512);
+ if (err)
+ return err;
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
return 0;
}
-static void unregister_xts_algs(void)
+static void unregister_avx_algs(void)
{
if (aes_xts_simdalg_aesni_avx)
simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
&aes_xts_simdalg_aesni_avx);
+ if (aes_gcm_simdalgs_aesni_avx[0])
+ simd_unregister_aeads(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+ aes_gcm_simdalgs_aesni_avx);
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
if (aes_xts_simdalg_vaes_avx2)
simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
@@ -1293,106 +1640,33 @@ static void unregister_xts_algs(void)
if (aes_xts_simdalg_vaes_avx10_256)
simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
&aes_xts_simdalg_vaes_avx10_256);
+ if (aes_gcm_simdalgs_vaes_avx10_256[0])
+ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+ aes_gcm_simdalgs_vaes_avx10_256);
if (aes_xts_simdalg_vaes_avx10_512)
simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
&aes_xts_simdalg_vaes_avx10_512);
+ if (aes_gcm_simdalgs_vaes_avx10_512[0])
+ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+ aes_gcm_simdalgs_vaes_avx10_512);
#endif
}
#else /* CONFIG_X86_64 */
-static int __init register_xts_algs(void)
+static struct aead_alg aes_gcm_algs_aesni[0];
+static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0];
+
+static int __init register_avx_algs(void)
{
return 0;
}
-static void unregister_xts_algs(void)
+static void unregister_avx_algs(void)
{
}
#endif /* !CONFIG_X86_64 */
-#ifdef CONFIG_X86_64
-static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
- unsigned int key_len)
-{
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
-
- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
- ctx->hash_subkey);
-}
-
-static int generic_gcmaes_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- __be32 counter = cpu_to_be32(1);
-
- memcpy(iv, req->iv, 12);
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
- aes_ctx);
-}
-
-static int generic_gcmaes_decrypt(struct aead_request *req)
-{
- __be32 counter = cpu_to_be32(1);
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
-
- memcpy(iv, req->iv, 12);
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
- aes_ctx);
-}
-
-static struct aead_alg aesni_aeads[] = { {
- .setkey = common_rfc4106_set_key,
- .setauthsize = common_rfc4106_set_authsize,
- .encrypt = helper_rfc4106_encrypt,
- .decrypt = helper_rfc4106_decrypt,
- .ivsize = GCM_RFC4106_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "__rfc4106(gcm(aes))",
- .cra_driver_name = "__rfc4106-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- },
-}, {
- .setkey = generic_gcmaes_set_key,
- .setauthsize = generic_gcmaes_set_authsize,
- .encrypt = generic_gcmaes_encrypt,
- .decrypt = generic_gcmaes_decrypt,
- .ivsize = GCM_AES_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "__gcm(aes)",
- .cra_driver_name = "__generic-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- },
-} };
-#else
-static struct aead_alg aesni_aeads[0];
-#endif
-
-static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
-
static const struct x86_cpu_id aesni_cpu_id[] = {
X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
{}
@@ -1406,17 +1680,6 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_AVX2)) {
- pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- static_branch_enable(&gcm_use_avx);
- static_branch_enable(&gcm_use_avx2);
- } else
- if (boot_cpu_has(X86_FEATURE_AVX)) {
- pr_info("AVX version of gcm_enc/dec engaged.\n");
- static_branch_enable(&gcm_use_avx);
- } else {
- pr_info("SSE version of gcm_enc/dec engaged.\n");
- }
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
@@ -1434,8 +1697,9 @@ static int __init aesni_init(void)
if (err)
goto unregister_cipher;
- err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
+ err = simd_register_aeads_compat(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
if (err)
goto unregister_skciphers;
@@ -1447,22 +1711,22 @@ static int __init aesni_init(void)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
- err = register_xts_algs();
+ err = register_avx_algs();
if (err)
- goto unregister_xts;
+ goto unregister_avx;
return 0;
-unregister_xts:
- unregister_xts_algs();
+unregister_avx:
+ unregister_avx_algs();
#ifdef CONFIG_X86_64
if (aesni_simd_xctr)
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
unregister_aeads:
#endif /* CONFIG_X86_64 */
- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
-
+ simd_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1473,8 +1737,9 @@ unregister_cipher:
static void __exit aesni_exit(void)
{
- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
+ simd_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
@@ -1482,7 +1747,7 @@ static void __exit aesni_exit(void)
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
- unregister_xts_algs();
+ unregister_avx_algs();
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 98cf3b4e4c9f..9f5e342b9845 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -195,6 +195,7 @@ module_init(crc32_pclmul_mod_init);
module_exit(crc32_pclmul_mod_fini);
MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
+MODULE_DESCRIPTION("CRC32 algorithm (IEEE 802.3) accelerated with PCLMULQDQ");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("crc32");
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index d55fa9e9b9e6..dcfc0de333de 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
+MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1dfb8af48a3c..08ff4b489f7e 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -12,7 +12,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
@@ -269,7 +269,7 @@ static int __init poly1305_simd_mod_init(void)
boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
/* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
static_branch_enable(&poly1305_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 90454cf18e0d..1a1ecfa7f72a 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -5,6 +5,7 @@
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
+#include <asm/cpu_device_id.h>
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
@@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return false;
- if (boot_cpu_data.x86 == 0x06 &&
- (boot_cpu_data.x86_model == 0x1c ||
- boot_cpu_data.x86_model == 0x26 ||
- boot_cpu_data.x86_model == 0x36)) {
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_BONNELL:
+ case INTEL_ATOM_BONNELL_MID:
+ case INTEL_ATOM_SALTWELL:
/*
* On Atom, twofish-3way is slower than original assembler
* implementation. Twofish-3way trades off some performance in