summaryrefslogtreecommitdiff
path: root/arch/x86/crypto/aes-gcm-aesni-x86_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto/aes-gcm-aesni-x86_64.S')
-rw-r--r--arch/x86/crypto/aes-gcm-aesni-x86_64.S1128
1 files changed, 1128 insertions, 0 deletions
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
new file mode 100644
index 000000000000..45940e2883a0
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -0,0 +1,1128 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-NI optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support the original set of AES instructions, i.e. AES-NI. Two
+// implementations are provided, one that uses AVX and one that doesn't. They
+// are very similar, being generated by the same macros. The only difference is
+// that the AVX implementation takes advantage of VEX-coded instructions in some
+// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
+// implementation does *not* use 256-bit vectors, as AES is not supported on
+// 256-bit vectors until the VAES feature (which this file doesn't target).
+//
+// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
+// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
+// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
+//
+// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// more thoroughly commented. This file has the following notable changes:
+//
+// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
+// there is only one AES block (and GHASH block) per register.
+//
+// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
+// 32. We work around this by being much more careful about using
+// registers, relying heavily on loads to load values as they are needed.
+//
+// - Masking is not available either. We work around this by implementing
+// partial block loads and stores using overlapping scalar loads and stores
+// combined with shifts and SSE4.1 insertion and extraction instructions.
+//
+// - The main loop is organized differently due to the different design
+// constraints. First, with just one AES block per SIMD register, on some
+// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
+// do an 8-register wide loop. Considering that and the fact that we have
+// just 16 SIMD registers to work with, it's not feasible to cache AES
+// round keys and GHASH key powers in registers across loop iterations.
+// That's not ideal, but also not actually that bad, since loads can run in
+// parallel with other instructions. Significantly, this also makes it
+// possible to roll up the inner loops, relying on hardware loop unrolling
+// instead of software loop unrolling, greatly reducing code size.
+//
+// - We implement the GHASH multiplications in the main loop using Karatsuba
+// multiplication instead of schoolbook multiplication. This saves one
+// pclmulqdq instruction per block, at the cost of one 64-bit load, one
+// pshufd, and 0.25 pxors per block. (This is without the three-argument
+// XOR support that would be provided by AVX512 / AVX10, which would be
+// more beneficial to schoolbook than Karatsuba.)
+//
+// As a rough approximation, we can assume that Karatsuba multiplication is
+// faster than schoolbook multiplication in this context if one pshufd and
+// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
+// load is "free" due to running in parallel with arithmetic instructions.)
+// This is true on AMD CPUs, including all that support pclmulqdq up to at
+// least Zen 3. It's also true on older Intel CPUs: Westmere through
+// Haswell on the Core side, and Silvermont through Goldmont Plus on the
+// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
+// benefit of Karatsuba should be substantial. On newer Intel CPUs,
+// schoolbook multiplication should be faster, but only marginally.
+//
+// Not all these CPUs were available to be tested. However, benchmarks on
+// available CPUs suggest that this approximation is plausible. Switching
+// to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
+// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
+// Considering that and the fact that Karatsuba should be even more
+// beneficial on older Intel CPUs, it seems like the right choice here.
+//
+// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
+// saved by using a multiplication-less reduction method. We don't do that
+// because it would require a large number of shift and xor instructions,
+// making it less worthwhile and likely harmful on newer CPUs.
+//
+// It does make sense to sometimes use a different reduction optimization
+// that saves a pclmulqdq, though: precompute the hash key times x^64, and
+// multiply the low half of the data block by the hash key with the extra
+// factor of x^64. This eliminates one step of the reduction. However,
+// this is incompatible with Karatsuba multiplication. Therefore, for
+// multi-block processing we use Karatsuba multiplication with a regular
+// reduction. For single-block processing, we use the x^64 optimization.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lgfpoly:
+ .quad 0xc200000000000000
+.Lone:
+ .quad 1
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+ // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
+ // 'len' 0xff bytes and the rest zeroes.
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
+
+// Offsets in struct aes_gcm_key_aesni
+#define OFFSETOF_AESKEYLEN 480
+#define OFFSETOF_H_POWERS 496
+#define OFFSETOF_H_POWERS_XORED 624
+#define OFFSETOF_H_TIMES_X64 688
+
+.text
+
+// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
+// assumes that all operands are distinct and that any mem operand is aligned.
+.macro _vpclmulqdq imm, src1, src2, dst
+.if USE_AVX
+ vpclmulqdq \imm, \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pclmulqdq \imm, \src1, \dst
+.endif
+.endm
+
+// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
+// that all operands are distinct and that any mem operand is aligned.
+.macro _vpshufb src1, src2, dst
+.if USE_AVX
+ vpshufb \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pshufb \src1, \dst
+.endif
+.endm
+
+// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
+// all operands are distinct.
+.macro _vpand src1, src2, dst
+.if USE_AVX
+ vpand \src1, \src2, \dst
+.else
+ movdqu \src1, \dst
+ pand \src2, \dst
+.endif
+.endm
+
+// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
+// be a temporary xmm register.
+.macro _xor_mem_to_reg mem, reg, tmp
+.if USE_AVX
+ vpxor \mem, \reg, \reg
+.else
+ movdqu \mem, \tmp
+ pxor \tmp, \reg
+.endif
+.endm
+
+// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
+// must be a temporary xmm register.
+.macro _test_mem mem, reg, tmp
+.if USE_AVX
+ vptest \mem, \reg
+.else
+ movdqu \mem, \tmp
+ ptest \tmp, \reg
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 15 bytes.
+ movq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ pinsrq $1, %rax, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ movq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and %rsi.
+.macro _store_partial_block src, dst
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 15 bytes.
+ pextrq $1, \src, %rax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
+ movq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ pextrd $1, \src, %eax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
+ movd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ pextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ pextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ pextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
+// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
+// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
+// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
+
+ // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
+.if \i == 0
+ _vpclmulqdq $0x01, \a, \b, \t0
+.elseif \i == 1
+ _vpclmulqdq $0x00, \a_times_x64, \b, \t1
+.elseif \i == 2
+ pxor \t1, \t0
+
+ // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
+.elseif \i == 3
+ _vpclmulqdq $0x11, \a, \b, \t1
+.elseif \i == 4
+ pclmulqdq $0x10, \a_times_x64, \b
+.elseif \i == 5
+ pxor \t1, \b
+.elseif \i == 6
+
+ // Fold MI into HI.
+ pshufd $0x4e, \t0, \t1 // Swap halves of MI
+.elseif \i == 7
+ pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ pxor \t1, \b
+.elseif \i == 9
+ pxor \t0, \b
+.endif
+.endm
+
+// GHASH-multiply \a by \b and store the reduced product in \b.
+// See _ghash_mul_step for details.
+.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
+.endr
+.endm
+
+// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
+// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
+// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
+// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
+.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, \a, \b, \t0
+ pxor \t0, \lo
+
+ // b_L + b_H
+ pshufd $0x4e, \b, \t0
+ pxor \b, \t0
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, \a, \b
+ pxor \b, \hi
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, \a_xored, \t0
+ pxor \t0, \mi
+.endm
+
+// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
+// This assumes that _ghash_mul_noreduce was used.
+.macro _ghash_reduce lo, mi, hi, dst, t0
+
+ movq .Lgfpoly(%rip), \t0
+
+ // MI += LO + HI (needed because we used Karatsuba multiplication)
+ pxor \lo, \mi
+ pxor \hi, \mi
+
+ // Fold LO into MI.
+ pshufd $0x4e, \lo, \dst
+ pclmulqdq $0x00, \t0, \lo
+ pxor \dst, \mi
+ pxor \lo, \mi
+
+ // Fold MI into HI.
+ pshufd $0x4e, \mi, \dst
+ pclmulqdq $0x00, \t0, \mi
+ pxor \hi, \dst
+ pxor \mi, \dst
+.endm
+
+// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
+//
+// The whole GHASH update does:
+//
+// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
+// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
+//
+// This macro just does the first step: it does the unreduced multiplication
+// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
+// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
+// inner block counter in %rax, which is a value that counts up by 8 for each
+// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
+//
+// To reduce the number of pclmulqdq instructions required, both this macro and
+// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
+// multiplication. See the file comment for more details about this choice.
+//
+// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
+// encrypting, or SRC if decrypting. They also expect the precomputed hash key
+// powers H^i and their XOR'd-together halves to be available in the struct
+// pointed to by KEY. Both macros clobber TMP[0-2].
+.macro _ghash_update_begin_8x enc
+
+ // Initialize the inner block counter.
+ xor %eax, %eax
+
+ // Load the highest hash key power, H^8.
+ movdqa OFFSETOF_H_POWERS(KEY), TMP0
+
+ // Load the first ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST), TMP1
+.else
+ movdqu (SRC), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // Add the GHASH accumulator to the ciphertext block to get the block
+ // 'b' that needs to be multiplied with the hash key power 'a'.
+ pxor TMP1, GHASH_ACC
+
+ // b_L + b_H
+ pshufd $0x4e, GHASH_ACC, MI
+ pxor GHASH_ACC, MI
+
+ // LO = a_L * b_L
+ _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO
+
+ // HI = a_H * b_H
+ pclmulqdq $0x11, TMP0, GHASH_ACC
+
+ // MI = (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
+.endm
+
+// Continue the GHASH update of 8 ciphertext blocks as described above by doing
+// an unreduced multiplication of the next ciphertext block by the next lowest
+// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
+.macro _ghash_update_continue_8x enc
+ add $8, %eax
+
+ // Load the next lowest key power.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
+
+ // Load the next ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST,%rax,2), TMP1
+.else
+ movdqu (SRC,%rax,2), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, TMP0, TMP1, TMP2
+ pxor TMP2, LO
+
+ // b_L + b_H
+ pshufd $0x4e, TMP1, TMP2
+ pxor TMP1, TMP2
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, TMP0, TMP1
+ pxor TMP1, GHASH_ACC
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
+ pclmulqdq $0x00, TMP1, TMP2
+ pxor TMP2, MI
+.endm
+
+// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
+// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
+// it uses the same register for HI and the destination. It's also divided into
+// two steps. TMP1 must be preserved across steps.
+//
+// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
+// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
+// increase the critical path length, and it seems to slightly hurt performance.
+.macro _ghash_update_end_8x_step i
+.if \i == 0
+ movq .Lgfpoly(%rip), TMP1
+ pxor LO, MI
+ pxor GHASH_ACC, MI
+ pshufd $0x4e, LO, TMP2
+ pclmulqdq $0x00, TMP1, LO
+ pxor TMP2, MI
+ pxor LO, MI
+.elseif \i == 1
+ pshufd $0x4e, MI, TMP2
+ pclmulqdq $0x00, TMP1, MI
+ pxor TMP2, GHASH_ACC
+ pxor MI, GHASH_ACC
+.endif
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
+//
+// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
+// related fields in the key struct.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables.
+ // %xmm0-%xmm1 and %rax are used as temporaries.
+ .set RNDKEYLAST_PTR, %rsi
+ .set H_CUR, %xmm2
+ .set H_POW1, %xmm3 // H^1
+ .set H_POW1_X64, %xmm4 // H^1 * x^64
+ .set GFPOLY, %xmm5
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax
+1:
+ aesenc (%rax), H_POW1
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), H_POW1
+
+ // Preprocess the raw hash subkey as needed to operate on GHASH's
+ // bit-reflected values directly: reflect its bytes, then multiply it by
+ // x^-1 (using the backwards interpretation of polynomial coefficients
+ // from the GCM spec) or equivalently x^1 (using the alternative,
+ // natural interpretation of polynomial coefficients).
+ pshufb .Lbswap_mask(%rip), H_POW1
+ movdqa H_POW1, %xmm0
+ pshufd $0xd3, %xmm0, %xmm0
+ psrad $31, %xmm0
+ paddq H_POW1, H_POW1
+ pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0
+ pxor %xmm0, H_POW1
+
+ // Store H^1.
+ movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
+
+ // Compute and store H^1 * x^64.
+ movq .Lgfpoly(%rip), GFPOLY
+ pshufd $0x4e, H_POW1, %xmm0
+ _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64
+ pxor %xmm0, H_POW1_X64
+ movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
+
+ // Compute and store the halves of H^1 XOR'd together.
+ pxor H_POW1, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
+
+ // Compute and store the remaining key powers H^2 through H^8.
+ movdqa H_POW1, H_CUR
+ mov $6*8, %eax
+.Lprecompute_next\@:
+ // Compute H^i = H^{i-1} * H^1.
+ _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
+ // Store H^i.
+ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
+ // Compute and store the halves of H^i XOR'd together.
+ pshufd $0x4e, H_CUR, %xmm0
+ pxor H_CUR, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
+ sub $8, %eax
+ jge .Lprecompute_next\@
+
+ RET
+.endm
+
+// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+// u8 ghash_acc[16], const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+.macro _aes_gcm_aad_update
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ // Note: _load_partial_block relies on AADLEN being in %ecx.
+
+ // Additional local variables.
+ // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
+ .set BSWAP_MASK, %xmm2
+ .set GHASH_ACC, %xmm3
+ .set H_POW1, %xmm4 // H^1
+ .set H_POW1_X64, %xmm5 // H^1 * x^64
+ .set GFPOLY, %xmm6
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Process the AAD one full block at a time.
+ sub $16, AADLEN
+ jl .Laad_loop_1x_done\@
+.Laad_loop_1x\@:
+ movdqu (AAD), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+ add $16, AAD
+ sub $16, AADLEN
+ jge .Laad_loop_1x\@
+.Laad_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, AADLEN
+ jz .Laad_done\@
+
+ // Process a partial block of length 1 <= AADLEN <= 15.
+ // _load_partial_block assumes that %ecx contains AADLEN.
+ _load_partial_block AAD, %xmm0, %r10, %r10d
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+
+.Laad_done\@:
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+ RET
+.endm
+
+// Increment LE_CTR eight times to generate eight little-endian counter blocks,
+// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
+// the zero-th AES round key. Clobbers TMP0 and TMP1.
+.macro _ctr_begin_8x
+ movq .Lone(%rip), TMP0
+ movdqa (KEY), TMP1 // zero-th round key
+.irp i, 0,1,2,3,4,5,6,7
+ _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
+ pxor TMP1, AESDATA\i
+ paddd TMP0, LE_CTR
+.endr
+.endm
+
+// Do a non-last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenc_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenc \round_key, AESDATA\i
+.endr
+.endm
+
+// Do the last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenclast_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenclast \round_key, AESDATA\i
+.endr
+.endm
+
+// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
+// store the result to DST. Clobbers TMP0.
+.macro _xor_data_8x
+.irp i, 0,1,2,3,4,5,6,7
+ _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
+.endr
+.irp i, 0,1,2,3,4,5,6,7
+ movdqu AESDATA\i, \i*16(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+ // Note: the code setting up for _load_partial_block assumes that SRC is
+ // in %rcx (and that DATALEN is *not* in %rcx).
+
+ // Additional local variables
+
+ // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
+ // with LE_CTR_PTR, which is used only at the beginning.
+
+ .set AESKEYLEN, %r10d // AES key length in bytes
+ .set AESKEYLEN64, %r10
+ .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
+
+ // Put the most frequently used values in %xmm0-%xmm7 to reduce code
+ // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
+ .set TMP0, %xmm0
+ .set TMP1, %xmm1
+ .set TMP2, %xmm2
+ .set LO, %xmm3 // Low part of unreduced product
+ .set MI, %xmm4 // Middle part of unreduced product
+ .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
+ // the high part of unreduced product
+ .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
+ .set LE_CTR, %xmm7 // Little-endian counter value
+ .set AESDATA0, %xmm8
+ .set AESDATA1, %xmm9
+ .set AESDATA2, %xmm10
+ .set AESDATA3, %xmm11
+ .set AESDATA4, %xmm12
+ .set AESDATA5, %xmm13
+ .set AESDATA6, %xmm14
+ .set AESDATA7, %xmm15
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqu (LE_CTR_PTR), LE_CTR
+
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+
+ // If there are at least 8*16 bytes of data, then continue into the main
+ // loop, which processes 8*16 bytes of data per iteration.
+ //
+ // The main loop interleaves AES and GHASH to improve performance on
+ // CPUs that can execute these instructions in parallel. When
+ // decrypting, the GHASH input (the ciphertext) is immediately
+ // available. When encrypting, we instead encrypt a set of 8 blocks
+ // first and then GHASH those blocks while encrypting the next set of 8,
+ // repeat that as needed, and finally GHASH the last set of 8 blocks.
+ //
+ // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
+ // as this makes the immediate fit in a signed byte, saving 3 bytes.
+ add $-8*16, DATALEN
+ jl .Lcrypt_loop_8x_done\@
+.if \enc
+ // Encrypt the first 8 plaintext blocks.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ _aesenc_8x TMP0
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ movdqa (%rsi), TMP0
+ _aesenclast_8x TMP0
+ _xor_data_8x
+ // Don't increment DST until the ciphertext blocks have been hashed.
+ sub $-8*16, SRC
+ add $-8*16, DATALEN
+ jl .Lghash_last_ciphertext_8x\@
+.endif
+
+ .p2align 4
+.Lcrypt_loop_8x\@:
+
+ // Generate the next set of 8 counter blocks and start encrypting them.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+
+ // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
+ // by doing the unreduced multiplication for the first ciphertext block.
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_begin_8x \enc
+
+ // Do 7 more rounds of AES, and continue the GHASH update by doing the
+ // unreduced multiplication for the remaining ciphertext blocks.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+
+ // Do the remaining AES rounds.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+
+ // Do the GHASH reduction and the last round of AES.
+ movdqa (RNDKEYLAST_PTR), TMP0
+ _ghash_update_end_8x_step 0
+ _aesenclast_8x TMP0
+ _ghash_update_end_8x_step 1
+
+ // XOR the data with the AES-CTR keystream blocks.
+.if \enc
+ sub $-8*16, DST
+.endif
+ _xor_data_8x
+ sub $-8*16, SRC
+.if !\enc
+ sub $-8*16, DST
+.endif
+ add $-8*16, DATALEN
+ jge .Lcrypt_loop_8x\@
+
+.if \enc
+.Lghash_last_ciphertext_8x\@:
+ // Update GHASH with the last set of 8 ciphertext blocks.
+ _ghash_update_begin_8x \enc
+ .p2align 4
+1:
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+ _ghash_update_end_8x_step 0
+ _ghash_update_end_8x_step 1
+ sub $-8*16, DST
+.endif
+
+.Lcrypt_loop_8x_done\@:
+
+ sub $-8*16, DATALEN
+ jz .Ldone\@
+
+ // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
+ // things simple and keep the code size down by just going one block at
+ // a time, again taking advantage of hardware loop unrolling. Since
+ // there are enough key powers available for all remaining data, we do
+ // the GHASH multiplications unreduced, and only reduce at the very end.
+
+ .set HI, TMP2
+ .set H_POW, AESDATA0
+ .set H_POW_XORED, AESDATA1
+ .set ONE, AESDATA2
+
+ movq .Lone(%rip), ONE
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ pxor LO, LO
+ pxor MI, MI
+ pxor HI, HI
+
+ // Set up a block counter %rax to contain 8*(8-n), where n is the number
+ // of blocks that remain, counting any partial block. This will be used
+ // to access the key powers H^n through H^1.
+ mov DATALEN, %eax
+ neg %eax
+ and $~15, %eax
+ sar $1, %eax
+ add $64, %eax
+
+ sub $16, DATALEN
+ jl .Lcrypt_loop_1x_done\@
+
+ // Process the data one full block at a time.
+.Lcrypt_loop_1x\@:
+
+ // Encrypt the next counter block.
+ _vpshufb BSWAP_MASK, LE_CTR, TMP0
+ paddd ONE, LE_CTR
+ pxor (KEY), TMP0
+ lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rsi), TMP0
+ aesenc -6*16(%rsi), TMP0
+192:
+ aesenc -5*16(%rsi), TMP0
+ aesenc -4*16(%rsi), TMP0
+128:
+.irp i, -3,-2,-1,0,1,2,3,4,5
+ aesenc \i*16(%rsi), TMP0
+.endr
+ aesenclast (RNDKEYLAST_PTR), TMP0
+
+ // Load the next key power H^i.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // XOR the keystream block that was just generated in TMP0 with the next
+ // source data block and store the resulting en/decrypted data to DST.
+.if \enc
+ _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
+ movdqu TMP0, (DST)
+.else
+ movdqu (SRC), TMP1
+ pxor TMP1, TMP0
+ movdqu TMP0, (DST)
+.endif
+
+ // Update GHASH with the ciphertext block.
+.if \enc
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+.else
+ pshufb BSWAP_MASK, TMP1
+ pxor TMP1, GHASH_ACC
+.endif
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+ pxor GHASH_ACC, GHASH_ACC
+
+ add $8, %eax
+ add $16, SRC
+ add $16, DST
+ sub $16, DATALEN
+ jge .Lcrypt_loop_1x\@
+.Lcrypt_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, DATALEN
+ jz .Lghash_reduce\@
+
+ // Process a partial block of length 1 <= DATALEN <= 15.
+
+ // Encrypt a counter block for the last time.
+ pshufb BSWAP_MASK, LE_CTR
+ pxor (KEY), LE_CTR
+ lea 16(KEY), %rsi
+1:
+ aesenc (%rsi), LE_CTR
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), LE_CTR
+
+ // Load the lowest key power, H^1.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
+ // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
+ // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
+ mov SRC, RNDKEYLAST_PTR
+ mov DATALEN, %ecx
+ _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi
+
+ // XOR the keystream block that was just generated in LE_CTR with the
+ // source data block and store the resulting en/decrypted data to DST.
+ pxor TMP0, LE_CTR
+ mov DATALEN, %ecx
+ _store_partial_block LE_CTR, DST
+
+ // If encrypting, zero-pad the final ciphertext block for GHASH. (If
+ // decrypting, this was already done by _load_partial_block.)
+.if \enc
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub DATALEN64, %rax
+ _vpand (%rax), LE_CTR, TMP0
+.endif
+
+ // Update GHASH with the final ciphertext block.
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+
+.Lghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+
+ RET
+.endm
+
+// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm2 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set BSWAP_MASK, %xmm3
+ .set GHASH_ACC, %xmm4
+ .set H_POW1, %xmm5 // H^1
+ .set H_POW1_X64, %xmm6 // H^1 * x^64
+ .set GFPOLY, %xmm7
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ movdqu (LE_CTR_PTR), %xmm0
+ mov $1, %eax
+ pinsrd $0, %eax, %xmm0
+
+ // Build the lengths block and XOR it into the GHASH accumulator.
+ movq TOTAL_DATALEN, GHASH_ACC
+ pinsrq $1, TOTAL_AADLEN, GHASH_ACC
+ psllq $3, GHASH_ACC // Bytes to bits
+ _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1
+
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Make %rax point to the 6th from last AES round key. (Using signed
+ // byte offsets -7*16 through 6*16 decreases code size.)
+ lea (KEY,AESKEYLEN64,4), %rax
+
+ // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ pshufb BSWAP_MASK, %xmm0
+ pxor (KEY), %xmm0
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rax), %xmm0
+ aesenc -6*16(%rax), %xmm0
+192:
+ aesenc -5*16(%rax), %xmm0
+ aesenc -4*16(%rax), %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ aesenc (\i-3)*16(%rax), %xmm0
+ _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+.endr
+ aesenclast 6*16(%rax), %xmm0
+ _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+
+ // Undo the byte reflection of the GHASH accumulator.
+ pshufb BSWAP_MASK, GHASH_ACC
+
+ // Encrypt the GHASH accumulator.
+ pxor %xmm0, GHASH_ACC
+
+.if \enc
+ // Return the computed auth tag.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
+
+ // Verify the auth tag in constant time by XOR'ing the transmitted and
+ // computed auth tags together and using the ptest instruction to check
+ // whether the first TAGLEN bytes of the result are zero.
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
+ movl 8(%rsp), TAGLEN
+ lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
+ sub TAGLEN64, ZEROPAD_MASK_PTR
+ xor %eax, %eax
+ _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
+ sete %al
+.endif
+ RET
+.endm
+
+.set USE_AVX, 0
+SYM_FUNC_START(aes_gcm_precompute_aesni)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni)
+SYM_FUNC_START(aes_gcm_aad_update_aesni)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_update_aesni)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni)
+SYM_FUNC_START(aes_gcm_dec_update_aesni)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_final_aesni)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni)
+SYM_FUNC_START(aes_gcm_dec_final_aesni)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni)
+
+.set USE_AVX, 1
+SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
+SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)