Diffstat (limited to 'arch/x86/crypto/aes-gcm-avx10-x86_64.S')
-rw-r--r-- | arch/x86/crypto/aes-gcm-avx10-x86_64.S | 119
1 file changed, 48 insertions, 71 deletions
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
index 97e0ee515fc5..02ee11083d4f 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -88,7 +88,7 @@
 
 // A shuffle mask that reflects the bytes of 16-byte blocks
 .Lbswap_mask:
-	.octa	0x000102030405060708090a0b0c0d0e0f
+	.octa	0x000102030405060708090a0b0c0d0e0f
 
 // This is the GHASH reducing polynomial without its constant term, i.e.
 // x^128 + x^7 + x^2 + x, represented using the backwards mapping
@@ -384,8 +384,8 @@
 	vpshufd		$0xd3, H_CUR_XMM, %xmm0
 	vpsrad		$31, %xmm0, %xmm0
 	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
-	vpand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
-	vpxor		%xmm0, H_CUR_XMM, H_CUR_XMM
+	// H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+	vpternlogd	$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM
 
 	// Load the gfpoly constant.
 	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY
@@ -562,6 +562,32 @@
 	vpxord		RNDKEY0, V3, V3
 .endm
 
+// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+// data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
+.macro	_aesenclast_and_xor_4x
+	// XOR the source data with the last round key, saving the result in
+	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+	// Do the last AES round.  This handles the XOR with the source data
+	// too, as per the optimization described above.
+	vaesenclast	GHASHDATA0, V0, GHASHDATA0
+	vaesenclast	GHASHDATA1, V1, GHASHDATA1
+	vaesenclast	GHASHDATA2, V2, GHASHDATA2
+	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+
+	// Store the en/decrypted data to DST.
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+.endm
+
 // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
 //					   const u32 le_ctr[4], u8 ghash_acc[16],
 //					   const u8 *src, u8 *dst, int datalen);
@@ -640,7 +666,7 @@
 	// LE_CTR contains the next set of little-endian counter blocks.
 	.set	LE_CTR,		V12
 
-	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
 	.set	RNDKEY0,	V13
@@ -650,15 +676,10 @@
 	.set	RNDKEY_M7,	V17
 	.set	RNDKEY_M6,	V18
 	.set	RNDKEY_M5,	V19
-
-	// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
-	// the corresponding block of source data.  This is useful because
-	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
-	// be computed in parallel with the AES rounds.
-	.set	RNDKEYLAST0,	V20
-	.set	RNDKEYLAST1,	V21
-	.set	RNDKEYLAST2,	V22
-	.set	RNDKEYLAST3,	V23
+	.set	RNDKEY_M4,	V20
+	.set	RNDKEY_M3,	V21
+	.set	RNDKEY_M2,	V22
+	.set	RNDKEY_M1,	V23
 
 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
@@ -713,7 +734,7 @@
 	// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
 	// loop and also ensures that at least one write always occurs to
 	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
-	sub		$4*VL, DATALEN
+	add		$-4*VL, DATALEN	// shorter than 'sub 4*VL' when VL=32
 	jl		.Lcrypt_loop_4x_done\@
 
 	// Load powers of the hash key.
@@ -748,26 +769,15 @@
 		add		$16, %rax
 		cmp		%rax, RNDKEYLAST_PTR
 		jne		1b
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, DATALEN
+	_aesenclast_and_xor_4x
+	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, DATALEN
 	jl		.Lghash_last_ciphertext_4x\@
 .endif
 
 	// Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
 	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
 .endr
 
@@ -799,50 +809,17 @@
 		_vaesenc_4x	RNDKEY
 128:
 
-	// XOR the source data with the last round key, saving the result in
-	// RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
-	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
-	vpxord		GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
-	vpxord		GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
-	vpxord		GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
-	vpxord		GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
 	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
 	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+	_ghash_step_4x	(9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
-	_ghash_step_4x	(9 - \i)
-.endr
-.irp i, 4,3,2,1
-	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
-	_vaesenc_4x	RNDKEY
-	_ghash_step_4x	(9 - \i)
 .endr
 	_ghash_step_4x	9
-
-	// Do the last AES round.  This handles the XOR with the source data
-	// too, as per the optimization described above.
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-
-	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, DATALEN
+	_aesenclast_and_xor_4x
+	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, DATALEN
 	jge		.Lcrypt_loop_4x\@
 
 .if \enc
@@ -856,7 +833,7 @@
 
 .Lcrypt_loop_4x_done\@:
 	// Undo the extra subtraction by 4*VL and check whether data remains.
-	add		$4*VL, DATALEN
+	sub		$-4*VL, DATALEN	// shorter than 'add 4*VL' when VL=32
 	jz		.Ldone\@
 
 	// The data length isn't a multiple of 4*VL.  Process the remaining data
@@ -940,7 +917,7 @@
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
-	vmovdqu8	(POWERS_PTR), H_POW1
+	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
 	vmovdqu8	V0, V1{%k1}{z}
 .endif
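
Both the deleted code and the new _aesenclast_and_xor_4x macro rely on the identity noted in the comments, vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) (AT&T operand order; with the state first, aesenclast(a, key) ^ b == aesenclast(a, key ^ b)), which holds because the final step of AESENCLAST is an XOR with the round key. The following standalone C sketch is not part of the patch; it checks the identity with the AES-NI intrinsics on arbitrary test values, assumes a CPU with AES-NI, and uses an illustrative build command such as gcc -O2 -maes.

/*
 * Illustrative check of aesenclast(a, key) ^ b == aesenclast(a, key ^ b),
 * the property that lets the last-round key be XORed with the source data
 * ahead of time.  Not part of the kernel patch.
 */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	__m128i a   = _mm_set_epi32(0x00112233, 0x44556677, 0x08090a0b, 0x0c0d0e0f);
	__m128i key = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978, 0x17263544, 0x53627180);
	__m128i b   = _mm_set_epi32(0x13579bdf, 0x02468ace, 0x76543210, 0x0fedcba9);

	/* Do the last AES round, then XOR the source data into the keystream. */
	__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);

	/* Fold the source data into the round key first, as the macro does. */
	__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

	printf("identity %s\n", memcmp(&lhs, &rhs, sizeof(lhs)) ? "fails" : "holds");
	return 0;
}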
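
The vpternlogd change in the first code hunk folds the former vpand + vpxor pair into a single ternary-logic instruction. VPTERNLOG uses its 8-bit immediate as a truth table indexed by (dest_bit << 2) | (src1_bit << 1) | src2_bit, so the desired function H_CUR_XMM ^ (xmm0 & gfpoly_and_internal_carrybit) corresponds to immediate 0x78. The short C sketch below, again illustrative rather than part of the patch, derives that value.

/*
 * Illustrative derivation of the vpternlogd immediate.  Bit (a*4 + b*2 + c)
 * of the immediate is the result for destination bit a, first-source bit b
 * and second-source bit c; the patch wants a ^ (b & c).
 */
#include <stdio.h>

int main(void)
{
	unsigned int imm = 0;

	for (unsigned int idx = 0; idx < 8; idx++) {
		unsigned int a = (idx >> 2) & 1;	/* destination bit */
		unsigned int b = (idx >> 1) & 1;	/* first source bit */
		unsigned int c = idx & 1;		/* second source bit */

		imm |= (a ^ (b & c)) << idx;
	}
	printf("vpternlogd immediate: 0x%02x\n", imm);	/* prints 0x78 */
	return 0;
}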