From 8621caa0d45e731f2e9f5889ff5bb384fcd6e059 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Dec 2016 14:28:58 +0000 Subject: crypto: arm64/chacha20 - implement NEON version based on SSE3 code This is a straight port to arm64/NEON of the x86 SSE3 implementation of the ChaCha20 stream cipher. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/chacha20-neon-core.S | 480 +++++++++++++++++++++++++++++++++ arch/arm64/crypto/chacha20-neon-glue.c | 131 +++++++++ 4 files changed, 620 insertions(+) create mode 100644 arch/arm64/crypto/chacha20-neon-core.S create mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 450a85df041a..0bf0f531f539 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH +config CRYPTO_CHACHA20_NEON + tristate "NEON accelerated ChaCha20 symmetric cipher" + depends on KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_CHACHA20 + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index aa8888d7b744..9d2826c5fccf 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S new file mode 100644 index 000000000000..e2cd65580807 --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-core.S @@ -0,0 +1,480 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + + .text + .align 6 + +ENTRY(chacha20_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + + // + // This function encrypts one ChaCha20 block by loading the state matrix + // in four NEON registers. It performs matrix operation on four words in + // parallel, but requires shuffling to rearrange the words after each + // round. + // + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + mov x3, #10 + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v4.16b, v3.16b, v0.16b + shl v3.4s, v4.4s, #8 + sri v3.4s, v4.4s, #24 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v4.16b, v3.16b, v0.16b + shl v3.4s, v4.4s, #8 + sri v3.4s, v4.4s, #24 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs x3, x3, #1 + b.ne .Ldoubleround + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ret +ENDPROC(chacha20_block_xor_neon) + + .align 6 +ENTRY(chacha20_4block_xor_neon) + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + + // + // This function encrypts four consecutive ChaCha20 blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + adr x3, CTRINC + ld1 {v16.4s}, [x3] + + // x0..15[0-3] = s0..3[0..3] + mov x4, x0 + ld4r { v0.4s- v3.4s}, [x4], #16 + ld4r { v4.4s- v7.4s}, [x4], #16 + ld4r { v8.4s-v11.4s}, [x4], #16 + ld4r {v12.4s-v15.4s}, [x4] + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v16.4s + + mov x3, #10 + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + rev32 v15.8h, v15.8h + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v17.16b, v4.16b, v8.16b + eor v18.16b, v5.16b, v9.16b + eor v19.16b, v6.16b, v10.16b + eor v20.16b, v7.16b, v11.16b + + shl v4.4s, v17.4s, #12 + shl v5.4s, v18.4s, #12 + shl v6.4s, v19.4s, #12 + shl v7.4s, v20.4s, #12 + + sri v4.4s, v17.4s, #20 + sri v5.4s, v18.4s, #20 + sri v6.4s, v19.4s, #20 + sri v7.4s, v20.4s, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v17.16b, v12.16b, v0.16b + eor v18.16b, v13.16b, v1.16b + eor v19.16b, v14.16b, v2.16b + eor v20.16b, v15.16b, v3.16b + + shl v12.4s, v17.4s, #8 + shl v13.4s, v18.4s, #8 + shl v14.4s, v19.4s, #8 + shl v15.4s, v20.4s, #8 + + sri v12.4s, v17.4s, #24 + sri v13.4s, v18.4s, #24 + sri v14.4s, v19.4s, #24 + sri v15.4s, v20.4s, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v17.16b, v4.16b, v8.16b + eor v18.16b, v5.16b, v9.16b + eor v19.16b, v6.16b, v10.16b + eor v20.16b, v7.16b, v11.16b + + shl v4.4s, v17.4s, #7 + shl v5.4s, v18.4s, #7 + shl v6.4s, v19.4s, #7 + shl v7.4s, v20.4s, #7 + + sri v4.4s, v17.4s, #25 + sri v5.4s, v18.4s, #25 + sri v6.4s, v19.4s, #25 + sri v7.4s, v20.4s, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + rev32 v15.8h, v15.8h + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v17.16b, v5.16b, v10.16b + eor v18.16b, v6.16b, v11.16b + eor v19.16b, v7.16b, v8.16b + eor v20.16b, v4.16b, v9.16b + + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + shl v4.4s, v20.4s, #12 + + sri v5.4s, v17.4s, #20 + sri v6.4s, v18.4s, #20 + sri v7.4s, v19.4s, #20 + sri v4.4s, v20.4s, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v17.16b, v15.16b, v0.16b + eor v18.16b, v12.16b, v1.16b + eor v19.16b, v13.16b, v2.16b + eor v20.16b, v14.16b, v3.16b + + shl v15.4s, v17.4s, #8 + shl v12.4s, v18.4s, #8 + shl v13.4s, v19.4s, #8 + shl v14.4s, v20.4s, #8 + + sri v15.4s, v17.4s, #24 + sri v12.4s, v18.4s, #24 + sri v13.4s, v19.4s, #24 + sri v14.4s, v20.4s, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v17.16b, v5.16b, v10.16b + eor v18.16b, v6.16b, v11.16b + eor v19.16b, v7.16b, v8.16b + eor v20.16b, v4.16b, v9.16b + + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + shl v4.4s, v20.4s, #7 + + sri v5.4s, v17.4s, #25 + sri v6.4s, v18.4s, #25 + sri v7.4s, v19.4s, #25 + sri v4.4s, v20.4s, #25 + + subs x3, x3, #1 + b.ne .Ldoubleround4 + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + ld4r {v17.4s-v20.4s}, [x0], #16 + add v0.4s, v0.4s, v17.4s + add v1.4s, v1.4s, v18.4s + add v2.4s, v2.4s, v19.4s + add v3.4s, v3.4s, v20.4s + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + ld4r {v21.4s-v24.4s}, [x0], #16 + add v4.4s, v4.4s, v21.4s + add v5.4s, v5.4s, v22.4s + add v6.4s, v6.4s, v23.4s + add v7.4s, v7.4s, v24.4s + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + ld4r {v17.4s-v20.4s}, [x0], #16 + add v8.4s, v8.4s, v17.4s + add v9.4s, v9.4s, v18.4s + add v10.4s, v10.4s, v19.4s + add v11.4s, v11.4s, v20.4s + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + ld4r {v21.4s-v24.4s}, [x0] + add v12.4s, v12.4s, v21.4s + add v13.4s, v13.4s, v22.4s + add v14.4s, v14.4s, v23.4s + add v15.4s, v15.4s, v24.4s + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v16.4s + + ld1 {v16.16b-v19.16b}, [x2], #64 + ld1 {v20.16b-v23.16b}, [x2], #64 + + // interleave 32-bit words in state n, n+1 + zip1 v24.4s, v0.4s, v1.4s + zip1 v25.4s, v2.4s, v3.4s + zip1 v26.4s, v4.4s, v5.4s + zip1 v27.4s, v6.4s, v7.4s + zip1 v28.4s, v8.4s, v9.4s + zip1 v29.4s, v10.4s, v11.4s + zip1 v30.4s, v12.4s, v13.4s + zip1 v31.4s, v14.4s, v15.4s + + zip2 v1.4s, v0.4s, v1.4s + zip2 v3.4s, v2.4s, v3.4s + zip2 v5.4s, v4.4s, v5.4s + zip2 v7.4s, v6.4s, v7.4s + zip2 v9.4s, v8.4s, v9.4s + zip2 v11.4s, v10.4s, v11.4s + zip2 v13.4s, v12.4s, v13.4s + zip2 v15.4s, v14.4s, v15.4s + + mov v0.16b, v24.16b + mov v2.16b, v25.16b + mov v4.16b, v26.16b + mov v6.16b, v27.16b + mov v8.16b, v28.16b + mov v10.16b, v29.16b + mov v12.16b, v30.16b + mov v14.16b, v31.16b + + // interleave 64-bit words in state n, n+2 + zip1 v24.2d, v0.2d, v2.2d + zip1 v25.2d, v1.2d, v3.2d + zip1 v26.2d, v4.2d, v6.2d + zip1 v27.2d, v5.2d, v7.2d + zip1 v28.2d, v8.2d, v10.2d + zip1 v29.2d, v9.2d, v11.2d + zip1 v30.2d, v12.2d, v14.2d + zip1 v31.2d, v13.2d, v15.2d + + zip2 v2.2d, v0.2d, v2.2d + zip2 v3.2d, v1.2d, v3.2d + zip2 v6.2d, v4.2d, v6.2d + zip2 v7.2d, v5.2d, v7.2d + zip2 v10.2d, v8.2d, v10.2d + zip2 v11.2d, v9.2d, v11.2d + zip2 v14.2d, v12.2d, v14.2d + zip2 v15.2d, v13.2d, v15.2d + + mov v0.16b, v24.16b + mov v1.16b, v25.16b + mov v4.16b, v26.16b + mov v5.16b, v27.16b + + mov v8.16b, v28.16b + mov v9.16b, v29.16b + mov v12.16b, v30.16b + mov v13.16b, v31.16b + + ld1 {v24.16b-v27.16b}, [x2], #64 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v4.16b + eor v18.16b, v18.16b, v8.16b + eor v19.16b, v19.16b, v12.16b + st1 {v16.16b-v19.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v14.16b + st1 {v20.16b-v23.16b}, [x1], #64 + + eor v24.16b, v24.16b, v1.16b + eor v25.16b, v25.16b, v5.16b + eor v26.16b, v26.16b, v9.16b + eor v27.16b, v27.16b, v13.16b + st1 {v24.16b-v27.16b}, [x1], #64 + + eor v28.16b, v28.16b, v3.16b + eor v29.16b, v29.16b, v7.16b + eor v30.16b, v30.16b, v11.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + + ret +ENDPROC(chacha20_4block_xor_neon) + +CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c new file mode 100644 index 000000000000..705b42b06d00 --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -0,0 +1,131 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include + +asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_neon(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + u32 state[16]; + int err; + + if (nbytes <= CHACHA20_BLOCK_SIZE) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_neon_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_neon_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-neon", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From 5386e5d1f8b7305e447b781f7ac02649f7a4d055 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 28 Dec 2016 17:39:26 +0800 Subject: Revert "crypto: arm64/ARM: NEON accelerated ChaCha20" This patch reverts the following commits: 8621caa0d45e731f2e9f5889ff5bb384fcd6e059 8096667273477e735b0072b11a6d617ccee45e5f I should not have applied them because they had already been obsoleted by a subsequent patch series. They also cause a build failure because of the subsequent commit 9ae433bc79f9. Fixes: 9ae433bc79f ("crypto: chacha20 - convert generic and...") Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 6 - arch/arm/crypto/Makefile | 2 - arch/arm/crypto/chacha20-neon-core.S | 523 --------------------------------- arch/arm/crypto/chacha20-neon-glue.c | 136 --------- arch/arm64/crypto/Kconfig | 6 - arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/chacha20-neon-core.S | 480 ------------------------------ arch/arm64/crypto/chacha20-neon-glue.c | 131 --------- 8 files changed, 1287 deletions(-) delete mode 100644 arch/arm/crypto/chacha20-neon-core.S delete mode 100644 arch/arm/crypto/chacha20-neon-glue.c delete mode 100644 arch/arm64/crypto/chacha20-neon-core.S delete mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 2f3339f015d3..13f1b4c289d4 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -130,10 +130,4 @@ config CRYPTO_CRC32_ARM_CE depends on KERNEL_MODE_NEON && CRC32 select CRYPTO_HASH -config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_CHACHA20 - endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 8d74e55eacd4..b578a1820ab1 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o @@ -41,7 +40,6 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S deleted file mode 100644 index b0a35935be7e..000000000000 --- a/arch/arm/crypto/chacha20-neon-core.S +++ /dev/null @@ -1,523 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SNEON3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include - - .text - .fpu neon - .align 5 - -ENTRY(chacha20_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requireds shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 - - mov r3, #10 - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #16 - vsri.u32 q3, q4, #16 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #8 - vsri.u32 q3, q4, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vext.8 q1, q1, q1, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vext.8 q3, q3, q3, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #16 - vsri.u32 q3, q4, #16 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #8 - vsri.u32 q3, q4, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vext.8 q1, q1, q1, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vext.8 q3, q3, q3, #4 - - subs r3, r3, #1 - bne .Ldoubleround - - add ip, r2, #0x20 - vld1.8 {q4-q5}, [r2] - vld1.8 {q6-q7}, [ip] - - // o0 = i0 ^ (x0 + s0) - vadd.i32 q0, q0, q8 - veor q0, q0, q4 - - // o1 = i1 ^ (x1 + s1) - vadd.i32 q1, q1, q9 - veor q1, q1, q5 - - // o2 = i2 ^ (x2 + s2) - vadd.i32 q2, q2, q10 - veor q2, q2, q6 - - // o3 = i3 ^ (x3 + s3) - vadd.i32 q3, q3, q11 - veor q3, q3, q7 - - add ip, r1, #0x20 - vst1.8 {q0-q1}, [r1] - vst1.8 {q2-q3}, [ip] - - bx lr -ENDPROC(chacha20_block_xor_neon) - - .align 5 -ENTRY(chacha20_4block_xor_neon) - push {r4-r6, lr} - mov ip, sp // preserve the stack pointer - sub r3, sp, #0x20 // allocate a 32 byte buffer - bic r3, r3, #0x1f // aligned to 32 bytes - mov sp, r3 - - // r0: Input state matrix, s - // r1: 4 data blocks output, o - // r2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - - // x0..15[0-3] = s0..3[0..3] - add r3, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [r3] - - adr r3, CTRINC - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] - vld1.32 {q11}, [r3, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vadd.i32 q12, q12, q11 // x12 += counter values 0-3 - vdup.32 q11, d5[1] - vdup.32 q10, d5[0] - vdup.32 q9, d4[1] - vdup.32 q8, d4[0] - vdup.32 q7, d3[1] - vdup.32 q6, d3[0] - vdup.32 q5, d2[1] - vdup.32 q4, d2[0] - vdup.32 q3, d1[1] - vdup.32 q2, d1[0] - vdup.32 q1, d0[1] - vdup.32 q0, d0[0] - - mov r3, #10 - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - vrev32.16 q15, q15 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #12 - vshl.u32 q5, q9, #12 - vsri.u32 q4, q8, #20 - vsri.u32 q5, q9, #20 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #12 - vshl.u32 q7, q9, #12 - vsri.u32 q6, q8, #20 - vsri.u32 q7, q9, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q8, q12, q0 - veor q9, q13, q1 - vshl.u32 q12, q8, #8 - vshl.u32 q13, q9, #8 - vsri.u32 q12, q8, #24 - vsri.u32 q13, q9, #24 - - veor q8, q14, q2 - veor q9, q15, q3 - vshl.u32 q14, q8, #8 - vshl.u32 q15, q9, #8 - vsri.u32 q14, q8, #24 - vsri.u32 q15, q9, #24 - - vld1.32 {q8-q9}, [sp, :256] - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #7 - vshl.u32 q5, q9, #7 - vsri.u32 q4, q8, #25 - vsri.u32 q5, q9, #25 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #7 - vshl.u32 q7, q9, #7 - vsri.u32 q6, q8, #25 - vsri.u32 q7, q9, #25 - - vld1.32 {q8-q9}, [sp, :256] - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vrev32.16 q15, q15 - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #12 - vshl.u32 q4, q9, #12 - vsri.u32 q7, q8, #20 - vsri.u32 q4, q9, #20 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #12 - vshl.u32 q6, q9, #12 - vsri.u32 q5, q8, #20 - vsri.u32 q6, q9, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q8, q15, q0 - veor q9, q12, q1 - vshl.u32 q15, q8, #8 - vshl.u32 q12, q9, #8 - vsri.u32 q15, q8, #24 - vsri.u32 q12, q9, #24 - - veor q8, q13, q2 - veor q9, q14, q3 - vshl.u32 q13, q8, #8 - vshl.u32 q14, q9, #8 - vsri.u32 q13, q8, #24 - vsri.u32 q14, q9, #24 - - vld1.32 {q8-q9}, [sp, :256] - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #7 - vshl.u32 q4, q9, #7 - vsri.u32 q7, q8, #25 - vsri.u32 q4, q9, #25 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #7 - vshl.u32 q6, q9, #7 - vsri.u32 q5, q8, #25 - vsri.u32 q6, q9, #25 - - subs r3, r3, #1 - beq 0f - - vld1.32 {q8-q9}, [sp, :256] - b .Ldoubleround4 - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] -0: ldmia r0!, {r3-r6} - vdup.32 q8, r3 - vdup.32 q9, r4 - vadd.i32 q0, q0, q8 - vadd.i32 q1, q1, q9 - vdup.32 q8, r5 - vdup.32 q9, r6 - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - ldmia r0!, {r3-r6} - vdup.32 q8, r3 - vdup.32 q9, r4 - vadd.i32 q4, q4, q8 - vadd.i32 q5, q5, q9 - vdup.32 q8, r5 - vdup.32 q9, r6 - vadd.i32 q6, q6, q8 - vadd.i32 q7, q7, q9 - - // interleave 32-bit words in state n, n+1 - vzip.32 q0, q1 - vzip.32 q2, q3 - vzip.32 q4, q5 - vzip.32 q6, q7 - - // interleave 64-bit words in state n, n+2 - vswp d1, d4 - vswp d3, d6 - vswp d9, d12 - vswp d11, d14 - - // xor with corresponding input, write to output - vld1.8 {q8-q9}, [r2]! - veor q8, q8, q0 - veor q9, q9, q4 - vst1.8 {q8-q9}, [r1]! - - vld1.32 {q8-q9}, [sp, :256] - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - ldmia r0!, {r3-r6} - vdup.32 q0, r3 - vdup.32 q4, r4 - vadd.i32 q8, q8, q0 - vadd.i32 q9, q9, q4 - vdup.32 q0, r5 - vdup.32 q4, r6 - vadd.i32 q10, q10, q0 - vadd.i32 q11, q11, q4 - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - ldmia r0!, {r3-r6} - vdup.32 q0, r3 - vdup.32 q4, r4 - adr r3, CTRINC - vadd.i32 q12, q12, q0 - vld1.32 {q0}, [r3, :128] - vadd.i32 q13, q13, q4 - vadd.i32 q12, q12, q0 // x12 += counter values 0-3 - - vdup.32 q0, r5 - vdup.32 q4, r6 - vadd.i32 q14, q14, q0 - vadd.i32 q15, q15, q4 - - // interleave 32-bit words in state n, n+1 - vzip.32 q8, q9 - vzip.32 q10, q11 - vzip.32 q12, q13 - vzip.32 q14, q15 - - // interleave 64-bit words in state n, n+2 - vswp d17, d20 - vswp d19, d22 - vswp d25, d28 - vswp d27, d30 - - vmov q4, q1 - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q8 - veor q1, q1, q12 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q2 - veor q1, q1, q6 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q10 - veor q1, q1, q14 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q4 - veor q1, q1, q5 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q9 - veor q1, q1, q13 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q3 - veor q1, q1, q7 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - - mov sp, ip - pop {r4-r6, pc} -ENDPROC(chacha20_4block_xor_neon) - - .align 4 -CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c deleted file mode 100644 index 554f7f6069da..000000000000 --- a/arch/arm/crypto/chacha20-neon-glue.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); - -static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA20_BLOCK_SIZE]; - - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } -} - -static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - u32 state[16]; - int err; - - if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); - - crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); - - kernel_neon_begin(); - - while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); - } - - if (walk.nbytes) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); - } - - kernel_neon_end(); - - return err; -} - -static struct crypto_alg alg = { - .cra_name = "chacha20", - .cra_driver_name = "chacha20-neon", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_type = &crypto_blkcipher_type, - .cra_ctxsize = sizeof(struct chacha20_ctx), - .cra_alignmask = sizeof(u32) - 1, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .geniv = "seqiv", - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, - }, -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_alg(&alg); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 0bf0f531f539..450a85df041a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,10 +72,4 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH -config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_CHACHA20 - endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 9d2826c5fccf..aa8888d7b744 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,9 +41,6 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o - AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S deleted file mode 100644 index e2cd65580807..000000000000 --- a/arch/arm64/crypto/chacha20-neon-core.S +++ /dev/null @@ -1,480 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include - - .text - .align 6 - -ENTRY(chacha20_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requires shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - mov x3, #10 - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v4.16b, v3.16b, v0.16b - shl v3.4s, v4.4s, #8 - sri v3.4s, v4.4s, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v4.16b, v3.16b, v0.16b - shl v3.4s, v4.4s, #8 - sri v3.4s, v4.4s, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs x3, x3, #1 - b.ne .Ldoubleround - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ret -ENDPROC(chacha20_block_xor_neon) - - .align 6 -ENTRY(chacha20_4block_xor_neon) - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - adr x3, CTRINC - ld1 {v16.4s}, [x3] - - // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r { v0.4s- v3.4s}, [x4], #16 - ld4r { v4.4s- v7.4s}, [x4], #16 - ld4r { v8.4s-v11.4s}, [x4], #16 - ld4r {v12.4s-v15.4s}, [x4] - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v16.4s - - mov x3, #10 - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - rev32 v15.8h, v15.8h - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v17.16b, v4.16b, v8.16b - eor v18.16b, v5.16b, v9.16b - eor v19.16b, v6.16b, v10.16b - eor v20.16b, v7.16b, v11.16b - - shl v4.4s, v17.4s, #12 - shl v5.4s, v18.4s, #12 - shl v6.4s, v19.4s, #12 - shl v7.4s, v20.4s, #12 - - sri v4.4s, v17.4s, #20 - sri v5.4s, v18.4s, #20 - sri v6.4s, v19.4s, #20 - sri v7.4s, v20.4s, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v17.16b, v12.16b, v0.16b - eor v18.16b, v13.16b, v1.16b - eor v19.16b, v14.16b, v2.16b - eor v20.16b, v15.16b, v3.16b - - shl v12.4s, v17.4s, #8 - shl v13.4s, v18.4s, #8 - shl v14.4s, v19.4s, #8 - shl v15.4s, v20.4s, #8 - - sri v12.4s, v17.4s, #24 - sri v13.4s, v18.4s, #24 - sri v14.4s, v19.4s, #24 - sri v15.4s, v20.4s, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v17.16b, v4.16b, v8.16b - eor v18.16b, v5.16b, v9.16b - eor v19.16b, v6.16b, v10.16b - eor v20.16b, v7.16b, v11.16b - - shl v4.4s, v17.4s, #7 - shl v5.4s, v18.4s, #7 - shl v6.4s, v19.4s, #7 - shl v7.4s, v20.4s, #7 - - sri v4.4s, v17.4s, #25 - sri v5.4s, v18.4s, #25 - sri v6.4s, v19.4s, #25 - sri v7.4s, v20.4s, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - rev32 v15.8h, v15.8h - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v17.16b, v5.16b, v10.16b - eor v18.16b, v6.16b, v11.16b - eor v19.16b, v7.16b, v8.16b - eor v20.16b, v4.16b, v9.16b - - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - shl v4.4s, v20.4s, #12 - - sri v5.4s, v17.4s, #20 - sri v6.4s, v18.4s, #20 - sri v7.4s, v19.4s, #20 - sri v4.4s, v20.4s, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v17.16b, v15.16b, v0.16b - eor v18.16b, v12.16b, v1.16b - eor v19.16b, v13.16b, v2.16b - eor v20.16b, v14.16b, v3.16b - - shl v15.4s, v17.4s, #8 - shl v12.4s, v18.4s, #8 - shl v13.4s, v19.4s, #8 - shl v14.4s, v20.4s, #8 - - sri v15.4s, v17.4s, #24 - sri v12.4s, v18.4s, #24 - sri v13.4s, v19.4s, #24 - sri v14.4s, v20.4s, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v17.16b, v5.16b, v10.16b - eor v18.16b, v6.16b, v11.16b - eor v19.16b, v7.16b, v8.16b - eor v20.16b, v4.16b, v9.16b - - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - shl v4.4s, v20.4s, #7 - - sri v5.4s, v17.4s, #25 - sri v6.4s, v18.4s, #25 - sri v7.4s, v19.4s, #25 - sri v4.4s, v20.4s, #25 - - subs x3, x3, #1 - b.ne .Ldoubleround4 - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - ld4r {v17.4s-v20.4s}, [x0], #16 - add v0.4s, v0.4s, v17.4s - add v1.4s, v1.4s, v18.4s - add v2.4s, v2.4s, v19.4s - add v3.4s, v3.4s, v20.4s - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - ld4r {v21.4s-v24.4s}, [x0], #16 - add v4.4s, v4.4s, v21.4s - add v5.4s, v5.4s, v22.4s - add v6.4s, v6.4s, v23.4s - add v7.4s, v7.4s, v24.4s - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - ld4r {v17.4s-v20.4s}, [x0], #16 - add v8.4s, v8.4s, v17.4s - add v9.4s, v9.4s, v18.4s - add v10.4s, v10.4s, v19.4s - add v11.4s, v11.4s, v20.4s - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - ld4r {v21.4s-v24.4s}, [x0] - add v12.4s, v12.4s, v21.4s - add v13.4s, v13.4s, v22.4s - add v14.4s, v14.4s, v23.4s - add v15.4s, v15.4s, v24.4s - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v16.4s - - ld1 {v16.16b-v19.16b}, [x2], #64 - ld1 {v20.16b-v23.16b}, [x2], #64 - - // interleave 32-bit words in state n, n+1 - zip1 v24.4s, v0.4s, v1.4s - zip1 v25.4s, v2.4s, v3.4s - zip1 v26.4s, v4.4s, v5.4s - zip1 v27.4s, v6.4s, v7.4s - zip1 v28.4s, v8.4s, v9.4s - zip1 v29.4s, v10.4s, v11.4s - zip1 v30.4s, v12.4s, v13.4s - zip1 v31.4s, v14.4s, v15.4s - - zip2 v1.4s, v0.4s, v1.4s - zip2 v3.4s, v2.4s, v3.4s - zip2 v5.4s, v4.4s, v5.4s - zip2 v7.4s, v6.4s, v7.4s - zip2 v9.4s, v8.4s, v9.4s - zip2 v11.4s, v10.4s, v11.4s - zip2 v13.4s, v12.4s, v13.4s - zip2 v15.4s, v14.4s, v15.4s - - mov v0.16b, v24.16b - mov v2.16b, v25.16b - mov v4.16b, v26.16b - mov v6.16b, v27.16b - mov v8.16b, v28.16b - mov v10.16b, v29.16b - mov v12.16b, v30.16b - mov v14.16b, v31.16b - - // interleave 64-bit words in state n, n+2 - zip1 v24.2d, v0.2d, v2.2d - zip1 v25.2d, v1.2d, v3.2d - zip1 v26.2d, v4.2d, v6.2d - zip1 v27.2d, v5.2d, v7.2d - zip1 v28.2d, v8.2d, v10.2d - zip1 v29.2d, v9.2d, v11.2d - zip1 v30.2d, v12.2d, v14.2d - zip1 v31.2d, v13.2d, v15.2d - - zip2 v2.2d, v0.2d, v2.2d - zip2 v3.2d, v1.2d, v3.2d - zip2 v6.2d, v4.2d, v6.2d - zip2 v7.2d, v5.2d, v7.2d - zip2 v10.2d, v8.2d, v10.2d - zip2 v11.2d, v9.2d, v11.2d - zip2 v14.2d, v12.2d, v14.2d - zip2 v15.2d, v13.2d, v15.2d - - mov v0.16b, v24.16b - mov v1.16b, v25.16b - mov v4.16b, v26.16b - mov v5.16b, v27.16b - - mov v8.16b, v28.16b - mov v9.16b, v29.16b - mov v12.16b, v30.16b - mov v13.16b, v31.16b - - ld1 {v24.16b-v27.16b}, [x2], #64 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v4.16b - eor v18.16b, v18.16b, v8.16b - eor v19.16b, v19.16b, v12.16b - st1 {v16.16b-v19.16b}, [x1], #64 - - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v14.16b - st1 {v20.16b-v23.16b}, [x1], #64 - - eor v24.16b, v24.16b, v1.16b - eor v25.16b, v25.16b, v5.16b - eor v26.16b, v26.16b, v9.16b - eor v27.16b, v27.16b, v13.16b - st1 {v24.16b-v27.16b}, [x1], #64 - - eor v28.16b, v28.16b, v3.16b - eor v29.16b, v29.16b, v7.16b - eor v30.16b, v30.16b, v11.16b - eor v31.16b, v31.16b, v15.16b - st1 {v28.16b-v31.16b}, [x1] - - ret -ENDPROC(chacha20_4block_xor_neon) - -CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c deleted file mode 100644 index 705b42b06d00..000000000000 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); - -static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA20_BLOCK_SIZE]; - - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } -} - -static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - u32 state[16]; - int err; - - if (nbytes <= CHACHA20_BLOCK_SIZE) - return crypto_chacha20_crypt(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); - - crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); - - kernel_neon_begin(); - - while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); - } - - if (walk.nbytes) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); - } - - kernel_neon_end(); - - return err; -} - -static struct crypto_alg alg = { - .cra_name = "chacha20", - .cra_driver_name = "chacha20-neon", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_type = &crypto_blkcipher_type, - .cra_ctxsize = sizeof(struct chacha20_ctx), - .cra_alignmask = sizeof(u32) - 1, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .geniv = "seqiv", - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, - }, -}; - -static int __init chacha20_simd_mod_init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From b7171ce9eb523fd90e38f2d138d1b6ed2ff3eafd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:49 +0000 Subject: crypto: arm64/chacha20 - implement NEON version based on SSE3 code This is a straight port to arm64/NEON of the x86 SSE3 implementation of the ChaCha20 stream cipher. It uses the new skcipher walksize attribute to process the input in strides of 4x the block size. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/chacha20-neon-core.S | 450 +++++++++++++++++++++++++++++++++ arch/arm64/crypto/chacha20-neon-glue.c | 127 ++++++++++ 4 files changed, 586 insertions(+) create mode 100644 arch/arm64/crypto/chacha20-neon-core.S create mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 450a85df041a..0bf0f531f539 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH +config CRYPTO_CHACHA20_NEON + tristate "NEON accelerated ChaCha20 symmetric cipher" + depends on KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_CHACHA20 + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index aa8888d7b744..9d2826c5fccf 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S new file mode 100644 index 000000000000..13c85e272c2a --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-core.S @@ -0,0 +1,450 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + + .text + .align 6 + +ENTRY(chacha20_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + + // + // This function encrypts one ChaCha20 block by loading the state matrix + // in four NEON registers. It performs matrix operation on four words in + // parallel, but requires shuffling to rearrange the words after each + // round. + // + + // x0..3 = s0..3 + adr x3, ROT8 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + ld1 {v12.4s}, [x3] + + mov x3, #10 + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs x3, x3, #1 + b.ne .Ldoubleround + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ret +ENDPROC(chacha20_block_xor_neon) + + .align 6 +ENTRY(chacha20_4block_xor_neon) + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + + // + // This function encrypts four consecutive ChaCha20 blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + adr x3, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x3] + + // x0..15[0-3] = s0..3[0..3] + mov x4, x0 + ld4r { v0.4s- v3.4s}, [x4], #16 + ld4r { v4.4s- v7.4s}, [x4], #16 + ld4r { v8.4s-v11.4s}, [x4], #16 + ld4r {v12.4s-v15.4s}, [x4] + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + mov x3, #10 + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + rev32 v15.8h, v15.8h + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + sri v5.4s, v17.4s, #20 + sri v6.4s, v18.4s, #20 + sri v7.4s, v19.4s, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + tbl v15.16b, {v15.16b}, v31.16b + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + sri v5.4s, v17.4s, #25 + sri v6.4s, v18.4s, #25 + sri v7.4s, v19.4s, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + rev32 v15.8h, v15.8h + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + sri v6.4s, v17.4s, #20 + sri v7.4s, v18.4s, #20 + sri v4.4s, v19.4s, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + tbl v15.16b, {v15.16b}, v31.16b + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + sri v6.4s, v17.4s, #25 + sri v7.4s, v18.4s, #25 + sri v4.4s, v19.4s, #25 + + subs x3, x3, #1 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + add v1.4s, v1.4s, v17.4s + add v2.4s, v2.4s, v18.4s + add v3.4s, v3.4s, v19.4s + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + add v5.4s, v5.4s, v21.4s + add v6.4s, v6.4s, v22.4s + add v7.4s, v7.4s, v23.4s + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + add v9.4s, v9.4s, v25.4s + add v10.4s, v10.4s, v26.4s + add v11.4s, v11.4s, v27.4s + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v30.4s + add v15.4s, v15.4s, v31.4s + + // interleave 32-bit words in state n, n+1 + zip1 v16.4s, v0.4s, v1.4s + zip2 v17.4s, v0.4s, v1.4s + zip1 v18.4s, v2.4s, v3.4s + zip2 v19.4s, v2.4s, v3.4s + zip1 v20.4s, v4.4s, v5.4s + zip2 v21.4s, v4.4s, v5.4s + zip1 v22.4s, v6.4s, v7.4s + zip2 v23.4s, v6.4s, v7.4s + zip1 v24.4s, v8.4s, v9.4s + zip2 v25.4s, v8.4s, v9.4s + zip1 v26.4s, v10.4s, v11.4s + zip2 v27.4s, v10.4s, v11.4s + zip1 v28.4s, v12.4s, v13.4s + zip2 v29.4s, v12.4s, v13.4s + zip1 v30.4s, v14.4s, v15.4s + zip2 v31.4s, v14.4s, v15.4s + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + ld1 {v16.16b-v19.16b}, [x2], #64 + + zip1 v1.2d, v20.2d, v22.2d + zip2 v5.2d, v20.2d, v22.2d + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + ld1 {v20.16b-v23.16b}, [x2], #64 + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + ld1 {v24.16b-v27.16b}, [x2], #64 + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + st1 {v16.16b-v19.16b}, [x1], #64 + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + st1 {v20.16b-v23.16b}, [x1], #64 + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + eor v28.16b, v28.16b, v12.16b + st1 {v24.16b-v27.16b}, [x1], #64 + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + + ret +ENDPROC(chacha20_4block_xor_neon) + +CTRINC: .word 0, 1, 2, 3 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c new file mode 100644 index 000000000000..a7f2337d46cf --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -0,0 +1,127 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include + +asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); + +static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_neon(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + u32 state[16]; + int err; + + if (req->cryptlen <= CHACHA20_BLOCK_SIZE) + return crypto_chacha20_crypt(req); + + err = skcipher_walk_virt(&walk, req, true); + + crypto_chacha20_init(state, ctx, walk.iv); + + kernel_neon_begin(); + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + kernel_neon_end(); + + return err; +} + +static struct skcipher_alg alg = { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_alignmask = 1, + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .walksize = 4 * CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_neon, + .decrypt = chacha20_neon, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_skcipher(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_skcipher(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From 293614ce3eda94a3c9b38d5c18fdc06eb1397221 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:51 +0000 Subject: crypto: arm64/aes-blk - expose AES-CTR as synchronous cipher as well In addition to wrapping the AES-CTR cipher into the async SIMD wrapper, which exposes it as an async skcipher that defers processing to process context, expose our AES-CTR implementation directly as a synchronous cipher as well, but with a lower priority. This makes the AES-CTR transform usable in places where synchronous transforms are required, such as the MAC802.11 encryption code, which executes in sotfirq context, where SIMD processing is allowed on arm64. Users of the async transform will keep the existing behavior. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 4e3f8adb1793..5164aaf82c6a 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -325,6 +325,23 @@ static struct skcipher_alg aes_algs[] = { { .setkey = skcipher_aes_setkey, .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, +}, { + .base = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-" MODE, + .cra_priority = PRIO - 1, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, }, { .base = { .cra_name = "__xts(aes)", @@ -350,8 +367,9 @@ static void aes_exit(void) { int i; - for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++) - simd_skcipher_free(aes_simd_algs[i]); + for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) + if (aes_simd_algs[i]) + simd_skcipher_free(aes_simd_algs[i]); crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } @@ -370,6 +388,9 @@ static int __init aes_init(void) return err; for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { + if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) + continue; + algname = aes_algs[i].base.cra_name + 2; drvname = aes_algs[i].base.cra_driver_name + 2; basename = aes_algs[i].base.cra_driver_name; -- cgit v1.2.3 From bed593c0e852f5c1efd3ca4e984fd744c51cf6ee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:52 +0000 Subject: crypto: arm64/aes - add scalar implementation This adds a scalar implementation of AES, based on the precomputed tables that are exposed by the generic AES code. Since rotates are cheap on arm64, this implementation only uses the 4 core tables (of 1 KB each), and avoids the prerotated ones, reducing the D-cache footprint by 75%. On Cortex-A57, this code manages 13.0 cycles per byte, which is ~34% faster than the generic C code. (Note that this is still >13x slower than the code that uses the optional ARMv8 Crypto Extensions, which manages <1 cycles per byte.) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 4 ++ arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/aes-cipher-core.S | 127 ++++++++++++++++++++++++++++++++++++ arch/arm64/crypto/aes-cipher-glue.c | 69 ++++++++++++++++++++ 4 files changed, 203 insertions(+) create mode 100644 arch/arm64/crypto/aes-cipher-core.S create mode 100644 arch/arm64/crypto/aes-cipher-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 0bf0f531f539..0826f8e599a6 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -41,6 +41,10 @@ config CRYPTO_CRC32_ARM64_CE depends on KERNEL_MODE_NEON && CRC32 select CRYPTO_HASH +config CRYPTO_AES_ARM64 + tristate "AES core cipher using scalar instructions" + select CRYPTO_AES + config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 9d2826c5fccf..a893507629eb 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -44,6 +44,9 @@ sha512-arm64-y := sha512-glue.o sha512-core.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o +aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S new file mode 100644 index 000000000000..37590ab8121a --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -0,0 +1,127 @@ +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + + .text + + rk .req x0 + out .req x1 + in .req x2 + rounds .req x3 + tt .req x4 + lt .req x2 + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + ldp \out0, \out1, [rk], #8 + + ubfx w13, \in0, #0, #8 + ubfx w14, \in1, #8, #8 + ldr w13, [tt, w13, uxtw #2] + ldr w14, [tt, w14, uxtw #2] + + .if \enc + ubfx w17, \in1, #0, #8 + ubfx w18, \in2, #8, #8 + .else + ubfx w17, \in3, #0, #8 + ubfx w18, \in0, #8, #8 + .endif + ldr w17, [tt, w17, uxtw #2] + ldr w18, [tt, w18, uxtw #2] + + ubfx w15, \in2, #16, #8 + ubfx w16, \in3, #24, #8 + ldr w15, [tt, w15, uxtw #2] + ldr w16, [tt, w16, uxtw #2] + + .if \enc + ubfx \t0, \in3, #16, #8 + ubfx \t1, \in0, #24, #8 + .else + ubfx \t0, \in1, #16, #8 + ubfx \t1, \in2, #24, #8 + .endif + ldr \t0, [tt, \t0, uxtw #2] + ldr \t1, [tt, \t1, uxtw #2] + + eor \out0, \out0, w13 + eor \out1, \out1, w17 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w18, ror #24 + eor \out0, \out0, w15, ror #16 + eor \out1, \out1, \t0, ror #16 + eor \out0, \out0, w16, ror #8 + eor \out1, \out1, \t1, ror #8 + .endm + + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .endm + + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .endm + + .macro do_crypt, round, ttab, ltab + ldp w5, w6, [in] + ldp w7, w8, [in, #8] + ldp w9, w10, [rk], #16 + ldp w11, w12, [rk, #-8] + +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) +CPU_BE( rev w8, w8 ) + + eor w5, w5, w9 + eor w6, w6, w10 + eor w7, w7, w11 + eor w8, w8, w12 + + ldr tt, =\ttab + ldr lt, =\ltab + + tbnz rounds, #1, 1f + +0: \round w9, w10, w11, w12, w5, w6, w7, w8 + \round w5, w6, w7, w8, w9, w10, w11, w12 + +1: subs rounds, rounds, #4 + \round w9, w10, w11, w12, w5, w6, w7, w8 + csel tt, tt, lt, hi + \round w5, w6, w7, w8, w9, w10, w11, w12 + b.hi 0b + +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) +CPU_BE( rev w8, w8 ) + + stp w5, w6, [out] + stp w7, w8, [out, #8] + ret + + .align 4 + .ltorg + .endm + + .align 5 +ENTRY(__aes_arm64_encrypt) + do_crypt fround, crypto_ft_tab, crypto_fl_tab +ENDPROC(__aes_arm64_encrypt) + + .align 5 +ENTRY(__aes_arm64_decrypt) + do_crypt iround, crypto_it_tab, crypto_il_tab +ENDPROC(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c new file mode 100644 index 000000000000..7288e7cbebff --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-glue.c @@ -0,0 +1,69 @@ +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +EXPORT_SYMBOL(__aes_arm64_encrypt); + +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +EXPORT_SYMBOL(__aes_arm64_decrypt); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_encrypt(ctx->key_enc, out, in, rounds); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_decrypt(ctx->key_dec, out, in, rounds); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-arm64", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + + .cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE, + .cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE, + .cra_cipher.cia_setkey = crypto_aes_set_key, + .cra_cipher.cia_encrypt = aes_encrypt, + .cra_cipher.cia_decrypt = aes_decrypt +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Scalar AES cipher for arm64"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("aes"); -- cgit v1.2.3 From 1abee99eafab67fb1c98f9ecfc43cd5735384a86 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:55 +0000 Subject: crypto: arm64/aes - reimplement bit-sliced ARM/NEON implementation for arm64 This is a reimplementation of the NEON version of the bit-sliced AES algorithm. This code is heavily based on Andy Polyakov's OpenSSL version for ARM, which is also available in the kernel. This is an alternative for the existing NEON implementation for arm64 authored by me, which suffers from poor performance due to its reliance on the pathologically slow four register variant of the tbl/tbx NEON instruction. This version is about ~30% (*) faster than the generic C code, but only in cases where the input can be 8x interleaved (this is a fundamental property of bit slicing). For this reason, only the chaining modes ECB, XTS and CTR are implemented. (The significance of ECB is that it could potentially be used by other chaining modes) * Measured on Cortex-A57. Note that this is still an order of magnitude slower than the implementations that use the dedicated AES instructions introduced in ARMv8, but those are part of an optional extension, and so it is good to have a fallback. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 7 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/aes-neonbs-core.S | 963 ++++++++++++++++++++++++++++++++++++ arch/arm64/crypto/aes-neonbs-glue.c | 420 ++++++++++++++++ 4 files changed, 1393 insertions(+) create mode 100644 arch/arm64/crypto/aes-neonbs-core.S create mode 100644 arch/arm64/crypto/aes-neonbs-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 0826f8e599a6..5de75c3dcbd4 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -82,4 +82,11 @@ config CRYPTO_CHACHA20_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 +config CRYPTO_AES_ARM64_BS + tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" + depends on KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_AES_ARM64 + select CRYPTO_SIMD + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index a893507629eb..d1ae1b9cbe70 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -47,6 +47,9 @@ chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o +obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o +aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S new file mode 100644 index 000000000000..8d0cdaa2768d --- /dev/null +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -0,0 +1,963 @@ +/* + * Bit sliced AES using NEON instructions + * + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * The algorithm implemented here is described in detail by the paper + * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and + * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) + * + * This implementation is based primarily on the OpenSSL implementation + * for 32-bit ARM written by Andy Polyakov + */ + +#include +#include + + .text + + rounds .req x11 + bskey .req x12 + + .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + eor \b2, \b2, \b1 + eor \b5, \b5, \b6 + eor \b3, \b3, \b0 + eor \b6, \b6, \b2 + eor \b5, \b5, \b0 + eor \b6, \b6, \b3 + eor \b3, \b3, \b7 + eor \b7, \b7, \b5 + eor \b3, \b3, \b4 + eor \b4, \b4, \b5 + eor \b2, \b2, \b7 + eor \b3, \b3, \b1 + eor \b1, \b1, \b5 + .endm + + .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 + eor \b0, \b0, \b6 + eor \b1, \b1, \b4 + eor \b4, \b4, \b6 + eor \b2, \b2, \b0 + eor \b6, \b6, \b1 + eor \b1, \b1, \b5 + eor \b5, \b5, \b3 + eor \b3, \b3, \b7 + eor \b7, \b7, \b5 + eor \b2, \b2, \b5 + eor \b4, \b4, \b7 + .endm + + .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 + eor \b1, \b1, \b7 + eor \b4, \b4, \b7 + eor \b7, \b7, \b5 + eor \b1, \b1, \b3 + eor \b2, \b2, \b5 + eor \b3, \b3, \b7 + eor \b6, \b6, \b1 + eor \b2, \b2, \b0 + eor \b5, \b5, \b3 + eor \b4, \b4, \b6 + eor \b0, \b0, \b6 + eor \b1, \b1, \b4 + .endm + + .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 + eor \b1, \b1, \b5 + eor \b2, \b2, \b7 + eor \b3, \b3, \b1 + eor \b4, \b4, \b5 + eor \b7, \b7, \b5 + eor \b3, \b3, \b4 + eor \b5, \b5, \b0 + eor \b3, \b3, \b7 + eor \b6, \b6, \b2 + eor \b2, \b2, \b1 + eor \b6, \b6, \b3 + eor \b3, \b3, \b0 + eor \b5, \b5, \b6 + .endm + + .macro mul_gf4, x0, x1, y0, y1, t0, t1 + eor \t0, \y0, \y1 + and \t0, \t0, \x0 + eor \x0, \x0, \x1 + and \t1, \x1, \y0 + and \x0, \x0, \y1 + eor \x1, \t1, \t0 + eor \x0, \x0, \t1 + .endm + + .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 + eor \t0, \y0, \y1 + eor \t1, \y2, \y3 + and \t0, \t0, \x0 + and \t1, \t1, \x2 + eor \x0, \x0, \x1 + eor \x2, \x2, \x3 + and \x1, \x1, \y0 + and \x3, \x3, \y2 + and \x0, \x0, \y1 + and \x2, \x2, \y3 + eor \x1, \x1, \x0 + eor \x2, \x2, \x3 + eor \x0, \x0, \t0 + eor \x3, \x3, \t1 + .endm + + .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, t0, t1, t2, t3 + eor \t0, \x0, \x2 + eor \t1, \x1, \x3 + mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 + eor \y0, \y0, \y2 + eor \y1, \y1, \y3 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 + eor \x0, \x0, \t0 + eor \x2, \x2, \t0 + eor \x1, \x1, \t1 + eor \x3, \x3, \t1 + eor \t0, \x4, \x6 + eor \t1, \x5, \x7 + mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 + eor \y0, \y0, \y2 + eor \y1, \y1, \y3 + mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 + eor \x4, \x4, \t0 + eor \x6, \x6, \t0 + eor \x5, \x5, \t1 + eor \x7, \x7, \t1 + .endm + + .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + eor \t3, \x4, \x6 + eor \t0, \x5, \x7 + eor \t1, \x1, \x3 + eor \s1, \x7, \x6 + eor \s0, \x0, \x2 + eor \s3, \t3, \t0 + orr \t2, \t0, \t1 + and \s2, \t3, \s0 + orr \t3, \t3, \s0 + eor \s0, \s0, \t1 + and \t0, \t0, \t1 + eor \t1, \x3, \x2 + and \s3, \s3, \s0 + and \s1, \s1, \t1 + eor \t1, \x4, \x5 + eor \s0, \x1, \x0 + eor \t3, \t3, \s1 + eor \t2, \t2, \s1 + and \s1, \t1, \s0 + orr \t1, \t1, \s0 + eor \t3, \t3, \s3 + eor \t0, \t0, \s1 + eor \t2, \t2, \s2 + eor \t1, \t1, \s3 + eor \t0, \t0, \s2 + and \s0, \x7, \x3 + eor \t1, \t1, \s2 + and \s1, \x6, \x2 + and \s2, \x5, \x1 + orr \s3, \x4, \x0 + eor \t3, \t3, \s0 + eor \t1, \t1, \s2 + eor \s0, \t0, \s3 + eor \t2, \t2, \s1 + and \s2, \t3, \t1 + eor \s1, \t2, \s2 + eor \s3, \s0, \s2 + bsl \s1, \t1, \s0 + not \t0, \s0 + bsl \s0, \s1, \s3 + bsl \t0, \s1, \s3 + bsl \s3, \t3, \t2 + eor \t3, \t3, \t2 + and \s2, \s0, \s3 + eor \t1, \t1, \t0 + eor \s2, \s2, \t3 + mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + .endm + + .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ + \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b + inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \ + \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ + \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ + \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b + out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ + \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b + .endm + + .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ + t0, t1, t2, t3, s0, s1, s2, s3 + inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ + \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b + inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \ + \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ + \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ + \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b + inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ + \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b + .endm + + .macro enc_next_rk + ldp q16, q17, [bskey], #128 + ldp q18, q19, [bskey, #-96] + ldp q20, q21, [bskey, #-64] + ldp q22, q23, [bskey, #-32] + .endm + + .macro dec_next_rk + ldp q16, q17, [bskey, #-128]! + ldp q18, q19, [bskey, #32] + ldp q20, q21, [bskey, #64] + ldp q22, q23, [bskey, #96] + .endm + + .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 + eor \x0\().16b, \x0\().16b, v16.16b + eor \x1\().16b, \x1\().16b, v17.16b + eor \x2\().16b, \x2\().16b, v18.16b + eor \x3\().16b, \x3\().16b, v19.16b + eor \x4\().16b, \x4\().16b, v20.16b + eor \x5\().16b, \x5\().16b, v21.16b + eor \x6\().16b, \x6\().16b, v22.16b + eor \x7\().16b, \x7\().16b, v23.16b + .endm + + .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask + tbl \x0\().16b, {\x0\().16b}, \mask\().16b + tbl \x1\().16b, {\x1\().16b}, \mask\().16b + tbl \x2\().16b, {\x2\().16b}, \mask\().16b + tbl \x3\().16b, {\x3\().16b}, \mask\().16b + tbl \x4\().16b, {\x4\().16b}, \mask\().16b + tbl \x5\().16b, {\x5\().16b}, \mask\().16b + tbl \x6\().16b, {\x6\().16b}, \mask\().16b + tbl \x7\().16b, {\x7\().16b}, \mask\().16b + .endm + + .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7, inv + ext \t0\().16b, \x0\().16b, \x0\().16b, #12 + ext \t1\().16b, \x1\().16b, \x1\().16b, #12 + eor \x0\().16b, \x0\().16b, \t0\().16b + ext \t2\().16b, \x2\().16b, \x2\().16b, #12 + eor \x1\().16b, \x1\().16b, \t1\().16b + ext \t3\().16b, \x3\().16b, \x3\().16b, #12 + eor \x2\().16b, \x2\().16b, \t2\().16b + ext \t4\().16b, \x4\().16b, \x4\().16b, #12 + eor \x3\().16b, \x3\().16b, \t3\().16b + ext \t5\().16b, \x5\().16b, \x5\().16b, #12 + eor \x4\().16b, \x4\().16b, \t4\().16b + ext \t6\().16b, \x6\().16b, \x6\().16b, #12 + eor \x5\().16b, \x5\().16b, \t5\().16b + ext \t7\().16b, \x7\().16b, \x7\().16b, #12 + eor \x6\().16b, \x6\().16b, \t6\().16b + eor \t1\().16b, \t1\().16b, \x0\().16b + eor \x7\().16b, \x7\().16b, \t7\().16b + ext \x0\().16b, \x0\().16b, \x0\().16b, #8 + eor \t2\().16b, \t2\().16b, \x1\().16b + eor \t0\().16b, \t0\().16b, \x7\().16b + eor \t1\().16b, \t1\().16b, \x7\().16b + ext \x1\().16b, \x1\().16b, \x1\().16b, #8 + eor \t5\().16b, \t5\().16b, \x4\().16b + eor \x0\().16b, \x0\().16b, \t0\().16b + eor \t6\().16b, \t6\().16b, \x5\().16b + eor \x1\().16b, \x1\().16b, \t1\().16b + ext \t0\().16b, \x4\().16b, \x4\().16b, #8 + eor \t4\().16b, \t4\().16b, \x3\().16b + ext \t1\().16b, \x5\().16b, \x5\().16b, #8 + eor \t7\().16b, \t7\().16b, \x6\().16b + ext \x4\().16b, \x3\().16b, \x3\().16b, #8 + eor \t3\().16b, \t3\().16b, \x2\().16b + ext \x5\().16b, \x7\().16b, \x7\().16b, #8 + eor \t4\().16b, \t4\().16b, \x7\().16b + ext \x3\().16b, \x6\().16b, \x6\().16b, #8 + eor \t3\().16b, \t3\().16b, \x7\().16b + ext \x6\().16b, \x2\().16b, \x2\().16b, #8 + eor \x7\().16b, \t1\().16b, \t5\().16b + .ifb \inv + eor \x2\().16b, \t0\().16b, \t4\().16b + eor \x4\().16b, \x4\().16b, \t3\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x3\().16b, \x3\().16b, \t6\().16b + eor \x6\().16b, \x6\().16b, \t2\().16b + .else + eor \t3\().16b, \t3\().16b, \x4\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x2\().16b, \x3\().16b, \t6\().16b + eor \x3\().16b, \t0\().16b, \t4\().16b + eor \x4\().16b, \x6\().16b, \t2\().16b + mov \x6\().16b, \t3\().16b + .endif + .endm + + .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ + t0, t1, t2, t3, t4, t5, t6, t7 + ext \t0\().16b, \x0\().16b, \x0\().16b, #8 + ext \t6\().16b, \x6\().16b, \x6\().16b, #8 + ext \t7\().16b, \x7\().16b, \x7\().16b, #8 + eor \t0\().16b, \t0\().16b, \x0\().16b + ext \t1\().16b, \x1\().16b, \x1\().16b, #8 + eor \t6\().16b, \t6\().16b, \x6\().16b + ext \t2\().16b, \x2\().16b, \x2\().16b, #8 + eor \t7\().16b, \t7\().16b, \x7\().16b + ext \t3\().16b, \x3\().16b, \x3\().16b, #8 + eor \t1\().16b, \t1\().16b, \x1\().16b + ext \t4\().16b, \x4\().16b, \x4\().16b, #8 + eor \t2\().16b, \t2\().16b, \x2\().16b + ext \t5\().16b, \x5\().16b, \x5\().16b, #8 + eor \t3\().16b, \t3\().16b, \x3\().16b + eor \t4\().16b, \t4\().16b, \x4\().16b + eor \t5\().16b, \t5\().16b, \x5\().16b + eor \x0\().16b, \x0\().16b, \t6\().16b + eor \x1\().16b, \x1\().16b, \t6\().16b + eor \x2\().16b, \x2\().16b, \t0\().16b + eor \x4\().16b, \x4\().16b, \t2\().16b + eor \x3\().16b, \x3\().16b, \t1\().16b + eor \x1\().16b, \x1\().16b, \t7\().16b + eor \x2\().16b, \x2\().16b, \t7\().16b + eor \x4\().16b, \x4\().16b, \t6\().16b + eor \x5\().16b, \x5\().16b, \t3\().16b + eor \x3\().16b, \x3\().16b, \t6\().16b + eor \x6\().16b, \x6\().16b, \t4\().16b + eor \x4\().16b, \x4\().16b, \t7\().16b + eor \x5\().16b, \x5\().16b, \t7\().16b + eor \x7\().16b, \x7\().16b, \t5\().16b + mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ + \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 + .endm + + .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 + ushr \t0\().2d, \b0\().2d, #\n + ushr \t1\().2d, \b1\().2d, #\n + eor \t0\().16b, \t0\().16b, \a0\().16b + eor \t1\().16b, \t1\().16b, \a1\().16b + and \t0\().16b, \t0\().16b, \mask\().16b + and \t1\().16b, \t1\().16b, \mask\().16b + eor \a0\().16b, \a0\().16b, \t0\().16b + shl \t0\().2d, \t0\().2d, #\n + eor \a1\().16b, \a1\().16b, \t1\().16b + shl \t1\().2d, \t1\().2d, #\n + eor \b0\().16b, \b0\().16b, \t0\().16b + eor \b1\().16b, \b1\().16b, \t1\().16b + .endm + + .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 + movi \t0\().16b, #0x55 + movi \t1\().16b, #0x33 + swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 + swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 + movi \t0\().16b, #0x0f + swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 + swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 + swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 + swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 + .endm + + + .align 6 +M0: .octa 0x0004080c0105090d02060a0e03070b0f + +M0SR: .octa 0x0004080c05090d010a0e02060f03070b +SR: .octa 0x0f0e0d0c0a09080b0504070600030201 +SRM0: .octa 0x01060b0c0207080d0304090e00050a0f + +M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 +ISR: .octa 0x0f0e0d0c080b0a090504070602010003 +ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f + + /* + * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) + */ +ENTRY(aesbs_convert_key) + ld1 {v7.4s}, [x1], #16 // load round 0 key + ld1 {v17.4s}, [x1], #16 // load round 1 key + + movi v8.16b, #0x01 // bit masks + movi v9.16b, #0x02 + movi v10.16b, #0x04 + movi v11.16b, #0x08 + movi v12.16b, #0x10 + movi v13.16b, #0x20 + movi v14.16b, #0x40 + movi v15.16b, #0x80 + ldr q16, M0 + + sub x2, x2, #1 + str q7, [x0], #16 // save round 0 key + +.Lkey_loop: + tbl v7.16b ,{v17.16b}, v16.16b + ld1 {v17.4s}, [x1], #16 // load next round key + + cmtst v0.16b, v7.16b, v8.16b + cmtst v1.16b, v7.16b, v9.16b + cmtst v2.16b, v7.16b, v10.16b + cmtst v3.16b, v7.16b, v11.16b + cmtst v4.16b, v7.16b, v12.16b + cmtst v5.16b, v7.16b, v13.16b + cmtst v6.16b, v7.16b, v14.16b + cmtst v7.16b, v7.16b, v15.16b + not v0.16b, v0.16b + not v1.16b, v1.16b + not v5.16b, v5.16b + not v6.16b, v6.16b + + subs x2, x2, #1 + stp q0, q1, [x0], #128 + stp q2, q3, [x0, #-96] + stp q4, q5, [x0, #-64] + stp q6, q7, [x0, #-32] + b.ne .Lkey_loop + + movi v7.16b, #0x63 // compose .L63 + eor v17.16b, v17.16b, v7.16b + str q17, [x0] + ret +ENDPROC(aesbs_convert_key) + + .align 4 +aesbs_encrypt8: + ldr q9, [bskey], #16 // round 0 key + ldr q8, M0SR + ldr q24, SR + + eor v10.16b, v0.16b, v9.16b // xor with round0 key + eor v11.16b, v1.16b, v9.16b + tbl v0.16b, {v10.16b}, v8.16b + eor v12.16b, v2.16b, v9.16b + tbl v1.16b, {v11.16b}, v8.16b + eor v13.16b, v3.16b, v9.16b + tbl v2.16b, {v12.16b}, v8.16b + eor v14.16b, v4.16b, v9.16b + tbl v3.16b, {v13.16b}, v8.16b + eor v15.16b, v5.16b, v9.16b + tbl v4.16b, {v14.16b}, v8.16b + eor v10.16b, v6.16b, v9.16b + tbl v5.16b, {v15.16b}, v8.16b + eor v11.16b, v7.16b, v9.16b + tbl v6.16b, {v10.16b}, v8.16b + tbl v7.16b, {v11.16b}, v8.16b + + bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + + sub rounds, rounds, #1 + b .Lenc_sbox + +.Lenc_loop: + shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 +.Lenc_sbox: + sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ + v13, v14, v15 + subs rounds, rounds, #1 + b.cc .Lenc_done + + enc_next_rk + + mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \ + v13, v14, v15 + + add_round_key v0, v1, v2, v3, v4, v5, v6, v7 + + b.ne .Lenc_loop + ldr q24, SRM0 + b .Lenc_loop + +.Lenc_done: + ldr q12, [bskey] // last round key + + bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11 + + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v12.16b + eor v6.16b, v6.16b, v12.16b + eor v3.16b, v3.16b, v12.16b + eor v7.16b, v7.16b, v12.16b + eor v2.16b, v2.16b, v12.16b + eor v5.16b, v5.16b, v12.16b + ret +ENDPROC(aesbs_encrypt8) + + .align 4 +aesbs_decrypt8: + lsl x9, rounds, #7 + add bskey, bskey, x9 + + ldr q9, [bskey, #-112]! // round 0 key + ldr q8, M0ISR + ldr q24, ISR + + eor v10.16b, v0.16b, v9.16b // xor with round0 key + eor v11.16b, v1.16b, v9.16b + tbl v0.16b, {v10.16b}, v8.16b + eor v12.16b, v2.16b, v9.16b + tbl v1.16b, {v11.16b}, v8.16b + eor v13.16b, v3.16b, v9.16b + tbl v2.16b, {v12.16b}, v8.16b + eor v14.16b, v4.16b, v9.16b + tbl v3.16b, {v13.16b}, v8.16b + eor v15.16b, v5.16b, v9.16b + tbl v4.16b, {v14.16b}, v8.16b + eor v10.16b, v6.16b, v9.16b + tbl v5.16b, {v15.16b}, v8.16b + eor v11.16b, v7.16b, v9.16b + tbl v6.16b, {v10.16b}, v8.16b + tbl v7.16b, {v11.16b}, v8.16b + + bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 + + sub rounds, rounds, #1 + b .Ldec_sbox + +.Ldec_loop: + shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 +.Ldec_sbox: + inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ + v13, v14, v15 + subs rounds, rounds, #1 + b.cc .Ldec_done + + dec_next_rk + + add_round_key v0, v1, v6, v4, v2, v7, v3, v5 + + inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \ + v13, v14, v15 + + b.ne .Ldec_loop + ldr q24, ISRM0 + b .Ldec_loop +.Ldec_done: + ldr q12, [bskey, #-16] // last round key + + bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11 + + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v12.16b + eor v6.16b, v6.16b, v12.16b + eor v4.16b, v4.16b, v12.16b + eor v2.16b, v2.16b, v12.16b + eor v7.16b, v7.16b, v12.16b + eor v3.16b, v3.16b, v12.16b + eor v5.16b, v5.16b, v12.16b + ret +ENDPROC(aesbs_decrypt8) + + /* + * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ + .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + stp x29, x30, [sp, #-16]! + mov x29, sp + +99: mov x5, #1 + lsl x5, x5, x4 + subs w4, w4, #8 + csel x4, x4, xzr, pl + csel x5, x5, xzr, mi + + ld1 {v0.16b}, [x1], #16 + tbnz x5, #1, 0f + ld1 {v1.16b}, [x1], #16 + tbnz x5, #2, 0f + ld1 {v2.16b}, [x1], #16 + tbnz x5, #3, 0f + ld1 {v3.16b}, [x1], #16 + tbnz x5, #4, 0f + ld1 {v4.16b}, [x1], #16 + tbnz x5, #5, 0f + ld1 {v5.16b}, [x1], #16 + tbnz x5, #6, 0f + ld1 {v6.16b}, [x1], #16 + tbnz x5, #7, 0f + ld1 {v7.16b}, [x1], #16 + +0: mov bskey, x2 + mov rounds, x3 + bl \do8 + + st1 {\o0\().16b}, [x0], #16 + tbnz x5, #1, 1f + st1 {\o1\().16b}, [x0], #16 + tbnz x5, #2, 1f + st1 {\o2\().16b}, [x0], #16 + tbnz x5, #3, 1f + st1 {\o3\().16b}, [x0], #16 + tbnz x5, #4, 1f + st1 {\o4\().16b}, [x0], #16 + tbnz x5, #5, 1f + st1 {\o5\().16b}, [x0], #16 + tbnz x5, #6, 1f + st1 {\o6\().16b}, [x0], #16 + tbnz x5, #7, 1f + st1 {\o7\().16b}, [x0], #16 + + cbnz x4, 99b + +1: ldp x29, x30, [sp], #16 + ret + .endm + + .align 4 +ENTRY(aesbs_ecb_encrypt) + __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 +ENDPROC(aesbs_ecb_encrypt) + + .align 4 +ENTRY(aesbs_ecb_decrypt) + __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 +ENDPROC(aesbs_ecb_decrypt) + + /* + * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ + .align 4 +ENTRY(aesbs_cbc_decrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + +99: mov x6, #1 + lsl x6, x6, x4 + subs w4, w4, #8 + csel x4, x4, xzr, pl + csel x6, x6, xzr, mi + + ld1 {v0.16b}, [x1], #16 + mov v25.16b, v0.16b + tbnz x6, #1, 0f + ld1 {v1.16b}, [x1], #16 + mov v26.16b, v1.16b + tbnz x6, #2, 0f + ld1 {v2.16b}, [x1], #16 + mov v27.16b, v2.16b + tbnz x6, #3, 0f + ld1 {v3.16b}, [x1], #16 + mov v28.16b, v3.16b + tbnz x6, #4, 0f + ld1 {v4.16b}, [x1], #16 + mov v29.16b, v4.16b + tbnz x6, #5, 0f + ld1 {v5.16b}, [x1], #16 + mov v30.16b, v5.16b + tbnz x6, #6, 0f + ld1 {v6.16b}, [x1], #16 + mov v31.16b, v6.16b + tbnz x6, #7, 0f + ld1 {v7.16b}, [x1] + +0: mov bskey, x2 + mov rounds, x3 + bl aesbs_decrypt8 + + ld1 {v24.16b}, [x5] // load IV + + eor v1.16b, v1.16b, v25.16b + eor v6.16b, v6.16b, v26.16b + eor v4.16b, v4.16b, v27.16b + eor v2.16b, v2.16b, v28.16b + eor v7.16b, v7.16b, v29.16b + eor v0.16b, v0.16b, v24.16b + eor v3.16b, v3.16b, v30.16b + eor v5.16b, v5.16b, v31.16b + + st1 {v0.16b}, [x0], #16 + mov v24.16b, v25.16b + tbnz x6, #1, 1f + st1 {v1.16b}, [x0], #16 + mov v24.16b, v26.16b + tbnz x6, #2, 1f + st1 {v6.16b}, [x0], #16 + mov v24.16b, v27.16b + tbnz x6, #3, 1f + st1 {v4.16b}, [x0], #16 + mov v24.16b, v28.16b + tbnz x6, #4, 1f + st1 {v2.16b}, [x0], #16 + mov v24.16b, v29.16b + tbnz x6, #5, 1f + st1 {v7.16b}, [x0], #16 + mov v24.16b, v30.16b + tbnz x6, #6, 1f + st1 {v3.16b}, [x0], #16 + mov v24.16b, v31.16b + tbnz x6, #7, 1f + ld1 {v24.16b}, [x1], #16 + st1 {v5.16b}, [x0], #16 +1: st1 {v24.16b}, [x5] // store IV + + cbnz x4, 99b + + ldp x29, x30, [sp], #16 + ret +ENDPROC(aesbs_cbc_decrypt) + + .macro next_tweak, out, in, const, tmp + sshr \tmp\().2d, \in\().2d, #63 + and \tmp\().16b, \tmp\().16b, \const\().16b + add \out\().2d, \in\().2d, \in\().2d + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 + eor \out\().16b, \out\().16b, \tmp\().16b + .endm + + .align 4 +.Lxts_mul_x: +CPU_LE( .quad 1, 0x87 ) +CPU_BE( .quad 0x87, 1 ) + + /* + * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ +__xts_crypt8: + mov x6, #1 + lsl x6, x6, x4 + subs w4, w4, #8 + csel x4, x4, xzr, pl + csel x6, x6, xzr, mi + + ld1 {v0.16b}, [x1], #16 + next_tweak v26, v25, v30, v31 + eor v0.16b, v0.16b, v25.16b + tbnz x6, #1, 0f + + ld1 {v1.16b}, [x1], #16 + next_tweak v27, v26, v30, v31 + eor v1.16b, v1.16b, v26.16b + tbnz x6, #2, 0f + + ld1 {v2.16b}, [x1], #16 + next_tweak v28, v27, v30, v31 + eor v2.16b, v2.16b, v27.16b + tbnz x6, #3, 0f + + ld1 {v3.16b}, [x1], #16 + next_tweak v29, v28, v30, v31 + eor v3.16b, v3.16b, v28.16b + tbnz x6, #4, 0f + + ld1 {v4.16b}, [x1], #16 + str q29, [sp, #16] + eor v4.16b, v4.16b, v29.16b + next_tweak v29, v29, v30, v31 + tbnz x6, #5, 0f + + ld1 {v5.16b}, [x1], #16 + str q29, [sp, #32] + eor v5.16b, v5.16b, v29.16b + next_tweak v29, v29, v30, v31 + tbnz x6, #6, 0f + + ld1 {v6.16b}, [x1], #16 + str q29, [sp, #48] + eor v6.16b, v6.16b, v29.16b + next_tweak v29, v29, v30, v31 + tbnz x6, #7, 0f + + ld1 {v7.16b}, [x1], #16 + str q29, [sp, #64] + eor v7.16b, v7.16b, v29.16b + next_tweak v29, v29, v30, v31 + +0: mov bskey, x2 + mov rounds, x3 + br x7 +ENDPROC(__xts_crypt8) + + .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 + stp x29, x30, [sp, #-80]! + mov x29, sp + + ldr q30, .Lxts_mul_x + ld1 {v25.16b}, [x5] + +99: adr x7, \do8 + bl __xts_crypt8 + + ldp q16, q17, [sp, #16] + ldp q18, q19, [sp, #48] + + eor \o0\().16b, \o0\().16b, v25.16b + eor \o1\().16b, \o1\().16b, v26.16b + eor \o2\().16b, \o2\().16b, v27.16b + eor \o3\().16b, \o3\().16b, v28.16b + + st1 {\o0\().16b}, [x0], #16 + mov v25.16b, v26.16b + tbnz x6, #1, 1f + st1 {\o1\().16b}, [x0], #16 + mov v25.16b, v27.16b + tbnz x6, #2, 1f + st1 {\o2\().16b}, [x0], #16 + mov v25.16b, v28.16b + tbnz x6, #3, 1f + st1 {\o3\().16b}, [x0], #16 + mov v25.16b, v29.16b + tbnz x6, #4, 1f + + eor \o4\().16b, \o4\().16b, v16.16b + eor \o5\().16b, \o5\().16b, v17.16b + eor \o6\().16b, \o6\().16b, v18.16b + eor \o7\().16b, \o7\().16b, v19.16b + + st1 {\o4\().16b}, [x0], #16 + tbnz x6, #5, 1f + st1 {\o5\().16b}, [x0], #16 + tbnz x6, #6, 1f + st1 {\o6\().16b}, [x0], #16 + tbnz x6, #7, 1f + st1 {\o7\().16b}, [x0], #16 + + cbnz x4, 99b + +1: st1 {v25.16b}, [x5] + ldp x29, x30, [sp], #80 + ret + .endm + +ENTRY(aesbs_xts_encrypt) + __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 +ENDPROC(aesbs_xts_encrypt) + +ENTRY(aesbs_xts_decrypt) + __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 +ENDPROC(aesbs_xts_decrypt) + + .macro next_ctr, v + mov \v\().d[1], x8 + adds x8, x8, #1 + mov \v\().d[0], x7 + adc x7, x7, xzr + rev64 \v\().16b, \v\().16b + .endm + + /* + * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + * int rounds, int blocks, u8 iv[], bool final) + */ +ENTRY(aesbs_ctr_encrypt) + stp x29, x30, [sp, #-16]! + mov x29, sp + + add x4, x4, x6 // do one extra block if final + + ldp x7, x8, [x5] + ld1 {v0.16b}, [x5] +CPU_LE( rev x7, x7 ) +CPU_LE( rev x8, x8 ) + adds x8, x8, #1 + adc x7, x7, xzr + +99: mov x9, #1 + lsl x9, x9, x4 + subs w4, w4, #8 + csel x4, x4, xzr, pl + csel x9, x9, xzr, le + + next_ctr v1 + next_ctr v2 + next_ctr v3 + next_ctr v4 + next_ctr v5 + next_ctr v6 + next_ctr v7 + +0: mov bskey, x2 + mov rounds, x3 + bl aesbs_encrypt8 + + lsr x9, x9, x6 // disregard the extra block + tbnz x9, #0, 0f + + ld1 {v8.16b}, [x1], #16 + eor v0.16b, v0.16b, v8.16b + st1 {v0.16b}, [x0], #16 + tbnz x9, #1, 1f + + ld1 {v9.16b}, [x1], #16 + eor v1.16b, v1.16b, v9.16b + st1 {v1.16b}, [x0], #16 + tbnz x9, #2, 2f + + ld1 {v10.16b}, [x1], #16 + eor v4.16b, v4.16b, v10.16b + st1 {v4.16b}, [x0], #16 + tbnz x9, #3, 3f + + ld1 {v11.16b}, [x1], #16 + eor v6.16b, v6.16b, v11.16b + st1 {v6.16b}, [x0], #16 + tbnz x9, #4, 4f + + ld1 {v12.16b}, [x1], #16 + eor v3.16b, v3.16b, v12.16b + st1 {v3.16b}, [x0], #16 + tbnz x9, #5, 5f + + ld1 {v13.16b}, [x1], #16 + eor v7.16b, v7.16b, v13.16b + st1 {v7.16b}, [x0], #16 + tbnz x9, #6, 6f + + ld1 {v14.16b}, [x1], #16 + eor v2.16b, v2.16b, v14.16b + st1 {v2.16b}, [x0], #16 + tbnz x9, #7, 7f + + ld1 {v15.16b}, [x1], #16 + eor v5.16b, v5.16b, v15.16b + st1 {v5.16b}, [x0], #16 + + next_ctr v0 + cbnz x4, 99b + +0: st1 {v0.16b}, [x5] +8: ldp x29, x30, [sp], #16 + ret + + /* + * If we are handling the tail of the input (x6 == 1), return the + * final keystream block back to the caller via the IV buffer. + */ +1: cbz x6, 8b + st1 {v1.16b}, [x5] + b 8b +2: cbz x6, 8b + st1 {v4.16b}, [x5] + b 8b +3: cbz x6, 8b + st1 {v6.16b}, [x5] + b 8b +4: cbz x6, 8b + st1 {v3.16b}, [x5] + b 8b +5: cbz x6, 8b + st1 {v7.16b}, [x5] + b 8b +6: cbz x6, 8b + st1 {v2.16b}, [x5] + b 8b +7: cbz x6, 8b + st1 {v5.16b}, [x5] + b 8b +ENDPROC(aesbs_ctr_encrypt) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c new file mode 100644 index 000000000000..323dd76ae5f0 --- /dev/null +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -0,0 +1,420 @@ +/* + * Bit sliced AES using NEON instructions + * + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); + +MODULE_ALIAS_CRYPTO("ecb(aes)"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("ctr(aes)"); +MODULE_ALIAS_CRYPTO("xts(aes)"); + +asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds); + +asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); +asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); + +asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[], bool final); + +asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +struct aesbs_ctx { + u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32]; + int rounds; +} __aligned(AES_BLOCK_SIZE); + +struct aesbs_cbc_ctx { + struct aesbs_ctx key; + u32 enc[AES_MAX_KEYLENGTH_U32]; +}; + +struct aesbs_xts_ctx { + struct aesbs_ctx key; + u32 twkey[AES_MAX_KEYLENGTH_U32]; +}; + +static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = crypto_aes_expand_key(&rk, in_key, key_len); + if (err) + return err; + + ctx->rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds); + kernel_neon_end(); + + return 0; +} + +static int __ecb_crypt(struct skcipher_request *req, + void (*fn)(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + kernel_neon_begin(); + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + if (walk.nbytes < walk.total) + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + + fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk, + ctx->rounds, blocks); + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static int ecb_encrypt(struct skcipher_request *req) +{ + return __ecb_crypt(req, aesbs_ecb_encrypt); +} + +static int ecb_decrypt(struct skcipher_request *req) +{ + return __ecb_crypt(req, aesbs_ecb_decrypt); +} + +static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = crypto_aes_expand_key(&rk, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc)); + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); + kernel_neon_end(); + + return 0; +} + +static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) +{ + struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + + __aes_arm64_encrypt(ctx->enc, dst, src, ctx->key.rounds); +} + +static int cbc_encrypt(struct skcipher_request *req) +{ + return crypto_cbc_encrypt_walk(req, cbc_encrypt_one); +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + kernel_neon_begin(); + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + if (walk.nbytes < walk.total) + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + + aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key.rk, ctx->key.rounds, blocks, + walk.iv); + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static int ctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + kernel_neon_begin(); + while (walk.nbytes > 0) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + bool final = (walk.total % AES_BLOCK_SIZE) != 0; + + if (walk.nbytes < walk.total) { + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + final = false; + } + + aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->rk, ctx->rounds, blocks, walk.iv, final); + + if (final) { + u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; + u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; + + if (dst != src) + memcpy(dst, src, walk.total % AES_BLOCK_SIZE); + crypto_xor(dst, walk.iv, walk.total % AES_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, 0); + break; + } + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx rk; + int err; + + err = xts_verify_key(tfm, in_key, key_len); + if (err) + return err; + + key_len /= 2; + err = crypto_aes_expand_key(&rk, in_key + key_len, key_len); + if (err) + return err; + + memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey)); + + return aesbs_setkey(tfm, in_key, key_len); +} + +static int __xts_crypt(struct skcipher_request *req, + void (*fn)(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[])) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + __aes_arm64_encrypt(ctx->twkey, walk.iv, walk.iv, ctx->key.rounds); + + kernel_neon_begin(); + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + if (walk.nbytes < walk.total) + blocks = round_down(blocks, + walk.stride / AES_BLOCK_SIZE); + + fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk, + ctx->key.rounds, blocks, walk.iv); + err = skcipher_walk_done(&walk, + walk.nbytes - blocks * AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static int xts_encrypt(struct skcipher_request *req) +{ + return __xts_crypt(req, aesbs_xts_encrypt); +} + +static int xts_decrypt(struct skcipher_request *req) +{ + return __xts_crypt(req, aesbs_xts_decrypt); +} + +static struct skcipher_alg aes_algs[] = { { + .base.cra_name = "__ecb(aes)", + .base.cra_driver_name = "__ecb-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .setkey = aesbs_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, +}, { + .base.cra_name = "__cbc(aes)", + .base.cra_driver_name = "__cbc-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_cbc_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, +}, { + .base.cra_name = "__ctr(aes)", + .base.cra_driver_name = "__ctr-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .chunksize = AES_BLOCK_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, +}, { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-neonbs", + .base.cra_priority = 250 - 1, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .chunksize = AES_BLOCK_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, +}, { + .base.cra_name = "__xts(aes)", + .base.cra_driver_name = "__xts-aes-neonbs", + .base.cra_priority = 250, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_xts_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, +} }; + +static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; + +static void aes_exit(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) + if (aes_simd_algs[i]) + simd_skcipher_free(aes_simd_algs[i]); + + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static int __init aes_init(void) +{ + struct simd_skcipher_alg *simd; + const char *basename; + const char *algname; + const char *drvname; + int err; + int i; + + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { + if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) + continue; + + algname = aes_algs[i].base.cra_name + 2; + drvname = aes_algs[i].base.cra_driver_name + 2; + basename = aes_algs[i].base.cra_driver_name; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + + aes_simd_algs[i] = simd; + } + return 0; + +unregister_simds: + aes_exit(); + return err; +} + +module_init(aes_init); +module_exit(aes_exit); -- cgit v1.2.3 From 8f4102dbd9b6a050491a966a74f030e65e29d33d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:33 +0000 Subject: crypto: arm64/aes-ce-ccm - remove cra_alignmask Remove the unnecessary alignmask: it is much more efficient to deal with the misalignment in the core algorithm than relying on the crypto API to copy the data to a suitably aligned buffer. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-glue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index cc5515dac74a..6a7dbc7c83a6 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -258,7 +258,6 @@ static struct aead_alg ccm_aes_alg = { .cra_priority = 300, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .ivsize = AES_BLOCK_SIZE, -- cgit v1.2.3 From ccc5d51ef968d0d7634d36afbaf0286126e12f09 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:34 +0000 Subject: crypto: arm64/aes-blk - remove cra_alignmask Remove the unnecessary alignmask: it is much more efficient to deal with the misalignment in the core algorithm than relying on the crypto API to copy the data to a suitably aligned buffer. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 16 ++++++---------- arch/arm64/crypto/aes-modes.S | 8 +++----- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 5164aaf82c6a..8ee1fb7aaa4f 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -215,14 +215,15 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *tsrc = walk.src.virt.addr; /* - * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need - * to tell aes_ctr_encrypt() to only read half a block. + * Tell aes_ctr_encrypt() to process a tail block. */ - blocks = (nbytes <= 8) ? -1 : 1; + blocks = -1; - aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, + aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - memcpy(tdst, tail, nbytes); + if (tdst != tsrc) + memcpy(tdst, tsrc, nbytes); + crypto_xor(tdst, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); @@ -282,7 +283,6 @@ static struct skcipher_alg aes_algs[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, @@ -298,7 +298,6 @@ static struct skcipher_alg aes_algs[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, @@ -315,7 +314,6 @@ static struct skcipher_alg aes_algs[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, @@ -332,7 +330,6 @@ static struct skcipher_alg aes_algs[] = { { .cra_priority = PRIO - 1, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, @@ -350,7 +347,6 @@ static struct skcipher_alg aes_algs[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), - .cra_alignmask = 7, .cra_module = THIS_MODULE, }, .min_keysize = 2 * AES_MIN_KEY_SIZE, diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 838dad5c209f..92b982a8b112 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -337,7 +337,7 @@ AES_ENTRY(aes_ctr_encrypt) .Lctrcarrydone: subs w4, w4, #1 - bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ + bmi .Lctrtailblock /* blocks <0 means tail block */ ld1 {v3.16b}, [x1], #16 eor v3.16b, v0.16b, v3.16b st1 {v3.16b}, [x0], #16 @@ -348,10 +348,8 @@ AES_ENTRY(aes_ctr_encrypt) FRAME_POP ret -.Lctrhalfblock: - ld1 {v3.8b}, [x1] - eor v3.8b, v0.8b, v3.8b - st1 {v3.8b}, [x0] +.Lctrtailblock: + st1 {v0.16b}, [x0] FRAME_POP ret -- cgit v1.2.3 From 4d1108fd747f88a6555e644073ee5629bad610b9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:35 +0000 Subject: crypto: arm64/chacha20 - remove cra_alignmask Remove the unnecessary alignmask: it is much more efficient to deal with the misalignment in the core algorithm than relying on the crypto API to copy the data to a suitably aligned buffer. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha20-neon-glue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index a7f2337d46cf..a7cd575ea223 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -93,7 +93,6 @@ static struct skcipher_alg alg = { .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_alignmask = 1, .base.cra_module = THIS_MODULE, .min_keysize = CHACHA20_KEY_SIZE, -- cgit v1.2.3 From 262ea4f670b792d0985090b1187b1f1ce2c2c648 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:36 +0000 Subject: crypto: arm64/aes - avoid literals for cross-module symbol references Using simple adrp/add pairs to refer to the AES lookup tables exposed by the generic AES driver (which could be loaded far away from this driver when KASLR is in effect) was unreliable at module load time before commit 41c066f2c4d4 ("arm64: assembler: make adr_l work in modules under KASLR"), which is why the AES code used literals instead. So now we can get rid of the literals, and switch to the adr_l macro. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-cipher-core.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index 37590ab8121a..cd58c61e6677 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -89,8 +89,8 @@ CPU_BE( rev w8, w8 ) eor w7, w7, w11 eor w8, w8, w12 - ldr tt, =\ttab - ldr lt, =\ltab + adr_l tt, \ttab + adr_l lt, \ltab tbnz rounds, #1, 1f @@ -111,9 +111,6 @@ CPU_BE( rev w8, w8 ) stp w5, w6, [out] stp w7, w8, [out, #8] ret - - .align 4 - .ltorg .endm .align 5 -- cgit v1.2.3 From c458c4ada0e3e3c898a56d0640d2ef70c9f702e3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:37 +0000 Subject: crypto: arm64/aes - performance tweak Shuffle some instructions around in the __hround macro to shave off 0.1 cycles per byte on Cortex-A57. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-cipher-core.S | 52 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index cd58c61e6677..f2f9cc519309 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -20,46 +20,32 @@ tt .req x4 lt .req x2 - .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc - ldp \out0, \out1, [rk], #8 - - ubfx w13, \in0, #0, #8 - ubfx w14, \in1, #8, #8 - ldr w13, [tt, w13, uxtw #2] - ldr w14, [tt, w14, uxtw #2] - + .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift + ubfx \reg0, \in0, #\shift, #8 .if \enc - ubfx w17, \in1, #0, #8 - ubfx w18, \in2, #8, #8 + ubfx \reg1, \in1e, #\shift, #8 .else - ubfx w17, \in3, #0, #8 - ubfx w18, \in0, #8, #8 + ubfx \reg1, \in1d, #\shift, #8 .endif - ldr w17, [tt, w17, uxtw #2] - ldr w18, [tt, w18, uxtw #2] + ldr \reg0, [tt, \reg0, uxtw #2] + ldr \reg1, [tt, \reg1, uxtw #2] + .endm - ubfx w15, \in2, #16, #8 - ubfx w16, \in3, #24, #8 - ldr w15, [tt, w15, uxtw #2] - ldr w16, [tt, w16, uxtw #2] + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + ldp \out0, \out1, [rk], #8 - .if \enc - ubfx \t0, \in3, #16, #8 - ubfx \t1, \in0, #24, #8 - .else - ubfx \t0, \in1, #16, #8 - ubfx \t1, \in2, #24, #8 - .endif - ldr \t0, [tt, \t0, uxtw #2] - ldr \t1, [tt, \t1, uxtw #2] + __pair \enc, w13, w14, \in0, \in1, \in3, 0 + __pair \enc, w15, w16, \in1, \in2, \in0, 8 + __pair \enc, w17, w18, \in2, \in3, \in1, 16 + __pair \enc, \t0, \t1, \in3, \in0, \in2, 24 eor \out0, \out0, w13 - eor \out1, \out1, w17 - eor \out0, \out0, w14, ror #24 - eor \out1, \out1, w18, ror #24 - eor \out0, \out0, w15, ror #16 - eor \out1, \out1, \t0, ror #16 - eor \out0, \out0, w16, ror #8 + eor \out1, \out1, w14 + eor \out0, \out0, w15, ror #24 + eor \out1, \out1, w16, ror #24 + eor \out0, \out0, w17, ror #16 + eor \out1, \out1, w18, ror #16 + eor \out0, \out0, \t0, ror #8 eor \out1, \out1, \t1, ror #8 .endm -- cgit v1.2.3 From 4edd7d015b95abcedde591a0c45965305d7cd524 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:38 +0000 Subject: crypto: arm64/aes-neon-blk - tweak performance for low end cores The non-bitsliced AES implementation using the NEON is highly sensitive to micro-architectural details, and, as it turns out, the Cortex-A53 on the Raspberry Pi 3 is a core that can benefit from this code, given that its scalar AES performance is abysmal (32.9 cycles per byte). The new bitsliced AES code manages 19.8 cycles per byte on this core, but can only operate on 8 blocks at a time, which is not supported by all chaining modes. With a bit of tweaking, we can get the plain NEON code to run at 22.0 cycles per byte, making it useful for sequential modes like CBC encryption. (Like bitsliced NEON, the plain NEON implementation does not use any lookup tables, which makes it easy on the D-cache, and invulnerable to cache timing attacks) So tweak the plain NEON AES code to use tbl instructions rather than shl/sri pairs, and to avoid the need to reload permutation vectors or other constants from memory in every round. Also, improve the decryption performance by switching to 16x8 pmul instructions for the performing the multiplications in GF(2^8). To allow the ECB and CBC encrypt routines to be reused by the bitsliced NEON code in a subsequent patch, export them from the module. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 2 + arch/arm64/crypto/aes-neon.S | 235 ++++++++++++++++++------------------------- 2 files changed, 102 insertions(+), 135 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 8ee1fb7aaa4f..055bc3f61138 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -409,5 +409,7 @@ unregister_simds: module_cpu_feature_match(AES, aes_init); #else module_init(aes_init); +EXPORT_SYMBOL(neon_aes_ecb_encrypt); +EXPORT_SYMBOL(neon_aes_cbc_encrypt); #endif module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index 85f07ead7c5c..f1e3aa2732f9 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -1,7 +1,7 @@ /* * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON * - * Copyright (C) 2013 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,17 +17,25 @@ /* multiply by polynomial 'x' in GF(2^8) */ .macro mul_by_x, out, in, temp, const sshr \temp, \in, #7 - add \out, \in, \in + shl \out, \in, #1 and \temp, \temp, \const eor \out, \out, \temp .endm + /* multiply by polynomial 'x^2' in GF(2^8) */ + .macro mul_by_x2, out, in, temp, const + ushr \temp, \in, #6 + shl \out, \in, #2 + pmul \temp, \temp, \const + eor \out, \out, \temp + .endm + /* preload the entire Sbox */ .macro prepare, sbox, shiftrows, temp adr \temp, \sbox - movi v12.16b, #0x40 + movi v12.16b, #0x1b ldr q13, \shiftrows - movi v14.16b, #0x1b + ldr q14, .Lror32by8 ld1 {v16.16b-v19.16b}, [\temp], #64 ld1 {v20.16b-v23.16b}, [\temp], #64 ld1 {v24.16b-v27.16b}, [\temp], #64 @@ -50,37 +58,31 @@ /* apply SubBytes transformation using the the preloaded Sbox */ .macro sub_bytes, in - sub v9.16b, \in\().16b, v12.16b + sub v9.16b, \in\().16b, v15.16b tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b - sub v10.16b, v9.16b, v12.16b + sub v10.16b, v9.16b, v15.16b tbx \in\().16b, {v20.16b-v23.16b}, v9.16b - sub v11.16b, v10.16b, v12.16b + sub v11.16b, v10.16b, v15.16b tbx \in\().16b, {v24.16b-v27.16b}, v10.16b tbx \in\().16b, {v28.16b-v31.16b}, v11.16b .endm /* apply MixColumns transformation */ - .macro mix_columns, in - mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b - rev32 v8.8h, \in\().8h - eor \in\().16b, v10.16b, \in\().16b - shl v9.4s, v8.4s, #24 - shl v11.4s, \in\().4s, #24 - sri v9.4s, v8.4s, #8 - sri v11.4s, \in\().4s, #8 - eor v9.16b, v9.16b, v8.16b - eor v10.16b, v10.16b, v9.16b - eor \in\().16b, v10.16b, v11.16b - .endm - + .macro mix_columns, in, enc + .if \enc == 0 /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ - .macro inv_mix_columns, in - mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b - mul_by_x v11.16b, v11.16b, v10.16b, v14.16b - eor \in\().16b, \in\().16b, v11.16b - rev32 v11.8h, v11.8h - eor \in\().16b, \in\().16b, v11.16b - mix_columns \in + mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b + eor \in\().16b, \in\().16b, v8.16b + rev32 v8.8h, v8.8h + eor \in\().16b, \in\().16b, v8.16b + .endif + + mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b + rev32 v8.8h, \in\().8h + eor v8.16b, v8.16b, v9.16b + eor \in\().16b, \in\().16b, v8.16b + tbl \in\().16b, {\in\().16b}, v14.16b + eor \in\().16b, \in\().16b, v8.16b .endm .macro do_block, enc, in, rounds, rk, rkp, i @@ -88,16 +90,13 @@ add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ sub_bytes \in - ld1 {v15.4s}, [\rkp], #16 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns \in - .else - inv_mix_columns \in - .endif + mix_columns \in, \enc b 1111b 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ .endm @@ -116,139 +115,114 @@ */ .macro sub_bytes_2x, in0, in1 - sub v8.16b, \in0\().16b, v12.16b - sub v9.16b, \in1\().16b, v12.16b + sub v8.16b, \in0\().16b, v15.16b tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + sub v9.16b, \in1\().16b, v15.16b tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, v8.16b, v12.16b - sub v11.16b, v9.16b, v12.16b + sub v10.16b, v8.16b, v15.16b tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + sub v11.16b, v9.16b, v15.16b tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v10.16b, v12.16b - sub v9.16b, v11.16b, v12.16b + sub v8.16b, v10.16b, v15.16b tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b + sub v9.16b, v11.16b, v15.16b tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b .endm .macro sub_bytes_4x, in0, in1, in2, in3 - sub v8.16b, \in0\().16b, v12.16b + sub v8.16b, \in0\().16b, v15.16b tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b - sub v9.16b, \in1\().16b, v12.16b + sub v9.16b, \in1\().16b, v15.16b tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, \in2\().16b, v12.16b + sub v10.16b, \in2\().16b, v15.16b tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b - sub v11.16b, \in3\().16b, v12.16b + sub v11.16b, \in3\().16b, v15.16b tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v8.16b, v12.16b + sub v8.16b, v8.16b, v15.16b tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b - sub v9.16b, v9.16b, v12.16b + sub v9.16b, v9.16b, v15.16b tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b - sub v10.16b, v10.16b, v12.16b + sub v10.16b, v10.16b, v15.16b tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b - sub v11.16b, v11.16b, v12.16b + sub v11.16b, v11.16b, v15.16b tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b - sub v8.16b, v8.16b, v12.16b + sub v8.16b, v8.16b, v15.16b tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b - sub v9.16b, v9.16b, v12.16b + sub v9.16b, v9.16b, v15.16b tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b - sub v10.16b, v10.16b, v12.16b + sub v10.16b, v10.16b, v15.16b tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b - sub v11.16b, v11.16b, v12.16b + sub v11.16b, v11.16b, v15.16b tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b .endm .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const - sshr \tmp0\().16b, \in0\().16b, #7 - add \out0\().16b, \in0\().16b, \in0\().16b - sshr \tmp1\().16b, \in1\().16b, #7 + sshr \tmp0\().16b, \in0\().16b, #7 + shl \out0\().16b, \in0\().16b, #1 + sshr \tmp1\().16b, \in1\().16b, #7 and \tmp0\().16b, \tmp0\().16b, \const\().16b - add \out1\().16b, \in1\().16b, \in1\().16b + shl \out1\().16b, \in1\().16b, #1 and \tmp1\().16b, \tmp1\().16b, \const\().16b eor \out0\().16b, \out0\().16b, \tmp0\().16b eor \out1\().16b, \out1\().16b, \tmp1\().16b .endm - .macro mix_columns_2x, in0, in1 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - rev32 v10.8h, \in0\().8h - rev32 v11.8h, \in1\().8h - eor \in0\().16b, v8.16b, \in0\().16b - eor \in1\().16b, v9.16b, \in1\().16b - shl v12.4s, v10.4s, #24 - shl v13.4s, v11.4s, #24 - eor v8.16b, v8.16b, v10.16b - sri v12.4s, v10.4s, #8 - shl v10.4s, \in0\().4s, #24 - eor v9.16b, v9.16b, v11.16b - sri v13.4s, v11.4s, #8 - shl v11.4s, \in1\().4s, #24 - sri v10.4s, \in0\().4s, #8 - eor \in0\().16b, v8.16b, v12.16b - sri v11.4s, \in1\().4s, #8 - eor \in1\().16b, v9.16b, v13.16b - eor \in0\().16b, v10.16b, \in0\().16b - eor \in1\().16b, v11.16b, \in1\().16b + .macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const + ushr \tmp0\().16b, \in0\().16b, #6 + shl \out0\().16b, \in0\().16b, #2 + ushr \tmp1\().16b, \in1\().16b, #6 + pmul \tmp0\().16b, \tmp0\().16b, \const\().16b + shl \out1\().16b, \in1\().16b, #2 + pmul \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, \tmp1\().16b .endm - .macro inv_mix_cols_2x, in0, in1 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 + .macro mix_columns_2x, in0, in1, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12 eor \in0\().16b, \in0\().16b, v8.16b - eor \in1\().16b, \in1\().16b, v9.16b rev32 v8.8h, v8.8h - rev32 v9.8h, v9.8h - eor \in0\().16b, \in0\().16b, v8.16b - eor \in1\().16b, \in1\().16b, v9.16b - mix_columns_2x \in0, \in1 - .endm - - .macro inv_mix_cols_4x, in0, in1, in2, in3 - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 - mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 - mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 - mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 - eor \in0\().16b, \in0\().16b, v8.16b eor \in1\().16b, \in1\().16b, v9.16b - eor \in2\().16b, \in2\().16b, v10.16b - eor \in3\().16b, \in3\().16b, v11.16b - rev32 v8.8h, v8.8h rev32 v9.8h, v9.8h - rev32 v10.8h, v10.8h - rev32 v11.8h, v11.8h eor \in0\().16b, \in0\().16b, v8.16b eor \in1\().16b, \in1\().16b, v9.16b - eor \in2\().16b, \in2\().16b, v10.16b - eor \in3\().16b, \in3\().16b, v11.16b - mix_columns_2x \in0, \in1 - mix_columns_2x \in2, \in3 + .endif + + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 + rev32 v10.8h, \in0\().8h + rev32 v11.8h, \in1\().8h + eor v10.16b, v10.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b + tbl \in0\().16b, {\in0\().16b}, v14.16b + tbl \in1\().16b, {\in1\().16b}, v14.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b .endm - .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i + .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - sub_bytes_2x \in0, \in1 + movi v15.16b, #0x40 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.4s}, [\rkp], #16 + sub_bytes_2x \in0, \in1 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns_2x \in0, \in1 - ldr q13, .LForward_ShiftRows - .else - inv_mix_cols_2x \in0, \in1 - ldr q13, .LReverse_ShiftRows - .endif - movi v12.16b, #0x40 + mix_columns_2x \in0, \in1, \enc b 1111b 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ @@ -262,23 +236,17 @@ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ - sub_bytes_4x \in0, \in1, \in2, \in3 + movi v15.16b, #0x40 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.4s}, [\rkp], #16 + sub_bytes_4x \in0, \in1, \in2, \in3 subs \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 beq 2222f - .if \enc == 1 - mix_columns_2x \in0, \in1 - mix_columns_2x \in2, \in3 - ldr q13, .LForward_ShiftRows - .else - inv_mix_cols_4x \in0, \in1, \in2, \in3 - ldr q13, .LReverse_ShiftRows - .endif - movi v12.16b, #0x40 + mix_columns_2x \in0, \in1, \enc + mix_columns_2x \in2, \in3, \enc b 1111b 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ @@ -305,19 +273,7 @@ #include "aes-modes.S" .text - .align 4 -.LForward_ShiftRows: -CPU_LE( .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 ) -CPU_LE( .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb ) -CPU_BE( .byte 0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8 ) -CPU_BE( .byte 0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0 ) - -.LReverse_ShiftRows: -CPU_LE( .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb ) -CPU_LE( .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 ) -CPU_BE( .byte 0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8 ) -CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 ) - + .align 6 .LForward_Sbox: .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 @@ -385,3 +341,12 @@ CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 ) .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +.LForward_ShiftRows: + .octa 0x0b06010c07020d08030e09040f0a0500 + +.LReverse_ShiftRows: + .octa 0x0306090c0f0205080b0e0104070a0d00 + +.Lror32by8: + .octa 0x0c0f0e0d080b0a090407060500030201 -- cgit v1.2.3 From 12fcd92305880504c9827c99ea128fecf1c99f0d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 28 Jan 2017 23:25:39 +0000 Subject: crypto: arm64/aes - replace scalar fallback with plain NEON fallback The new bitsliced NEON implementation of AES uses a fallback in two places: CBC encryption (which is strictly sequential, whereas this driver can only operate efficiently on 8 blocks at a time), and the XTS tweak generation, which involves encrypting a single AES block with a different key schedule. The plain (i.e., non-bitsliced) NEON code is more suitable as a fallback, given that it is faster than scalar on low end cores (which is what the NEON implementations target, since high end cores have dedicated instructions for AES), and shows similar behavior in terms of D-cache footprint and sensitivity to cache timing attacks. So switch the fallback handling to the plain NEON driver. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/aes-neonbs-glue.c | 38 +++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 5de75c3dcbd4..bed7feddfeed 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -86,7 +86,7 @@ config CRYPTO_AES_ARM64_BS tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER - select CRYPTO_AES_ARM64 + select CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_SIMD endif diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 323dd76ae5f0..863e436ecf89 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -42,7 +41,12 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); -asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +/* borrowed from aes-neon-blk.ko */ +asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, int first); +asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[], + int first); struct aesbs_ctx { u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32]; @@ -140,16 +144,28 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return 0; } -static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) +static int cbc_encrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + int err, first = 1; - __aes_arm64_encrypt(ctx->enc, dst, src, ctx->key.rounds); -} + err = skcipher_walk_virt(&walk, req, true); -static int cbc_encrypt(struct skcipher_request *req) -{ - return crypto_cbc_encrypt_walk(req, cbc_encrypt_one); + kernel_neon_begin(); + while (walk.nbytes >= AES_BLOCK_SIZE) { + unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; + + /* fall back to the non-bitsliced NEON implementation */ + neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + ctx->enc, ctx->key.rounds, blocks, walk.iv, + first); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); + first = 0; + } + kernel_neon_end(); + return err; } static int cbc_decrypt(struct skcipher_request *req) @@ -254,9 +270,11 @@ static int __xts_crypt(struct skcipher_request *req, err = skcipher_walk_virt(&walk, req, true); - __aes_arm64_encrypt(ctx->twkey, walk.iv, walk.iv, ctx->key.rounds); - kernel_neon_begin(); + + neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey, + ctx->key.rounds, 1, 1); + while (walk.nbytes >= AES_BLOCK_SIZE) { unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; -- cgit v1.2.3 From 88a3f582bea9e1da0346ea412950bbbdc3125cc1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 2 Feb 2017 11:38:55 +0000 Subject: crypto: arm64/aes - don't use IV buffer to return final keystream block The arm64 bit sliced AES core code uses the IV buffer to pass the final keystream block back to the glue code if the input is not a multiple of the block size, so that the asm code does not have to deal with anything except 16 byte blocks. This is done under the assumption that the outgoing IV is meaningless anyway in this case, given that chaining is no longer possible under these circumstances. However, as it turns out, the CCM driver does expect the IV to retain a value that is equal to the original IV except for the counter value, and even interprets byte zero as a length indicator, which may result in memory corruption if the IV is overwritten with something else. So use a separate buffer to return the final keystream block. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-neonbs-core.S | 37 +++++++++++++++++++++++-------------- arch/arm64/crypto/aes-neonbs-glue.c | 9 +++++---- 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index 8d0cdaa2768d..ca0472500433 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -853,13 +853,15 @@ ENDPROC(aesbs_xts_decrypt) /* * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], - * int rounds, int blocks, u8 iv[], bool final) + * int rounds, int blocks, u8 iv[], u8 final[]) */ ENTRY(aesbs_ctr_encrypt) stp x29, x30, [sp, #-16]! mov x29, sp - add x4, x4, x6 // do one extra block if final + cmp x6, #0 + cset x10, ne + add x4, x4, x10 // do one extra block if final ldp x7, x8, [x5] ld1 {v0.16b}, [x5] @@ -874,19 +876,26 @@ CPU_LE( rev x8, x8 ) csel x4, x4, xzr, pl csel x9, x9, xzr, le + tbnz x9, #1, 0f next_ctr v1 + tbnz x9, #2, 0f next_ctr v2 + tbnz x9, #3, 0f next_ctr v3 + tbnz x9, #4, 0f next_ctr v4 + tbnz x9, #5, 0f next_ctr v5 + tbnz x9, #6, 0f next_ctr v6 + tbnz x9, #7, 0f next_ctr v7 0: mov bskey, x2 mov rounds, x3 bl aesbs_encrypt8 - lsr x9, x9, x6 // disregard the extra block + lsr x9, x9, x10 // disregard the extra block tbnz x9, #0, 0f ld1 {v8.16b}, [x1], #16 @@ -928,36 +937,36 @@ CPU_LE( rev x8, x8 ) eor v5.16b, v5.16b, v15.16b st1 {v5.16b}, [x0], #16 - next_ctr v0 +8: next_ctr v0 cbnz x4, 99b 0: st1 {v0.16b}, [x5] -8: ldp x29, x30, [sp], #16 + ldp x29, x30, [sp], #16 ret /* - * If we are handling the tail of the input (x6 == 1), return the - * final keystream block back to the caller via the IV buffer. + * If we are handling the tail of the input (x6 != NULL), return the + * final keystream block back to the caller. */ 1: cbz x6, 8b - st1 {v1.16b}, [x5] + st1 {v1.16b}, [x6] b 8b 2: cbz x6, 8b - st1 {v4.16b}, [x5] + st1 {v4.16b}, [x6] b 8b 3: cbz x6, 8b - st1 {v6.16b}, [x5] + st1 {v6.16b}, [x6] b 8b 4: cbz x6, 8b - st1 {v3.16b}, [x5] + st1 {v3.16b}, [x6] b 8b 5: cbz x6, 8b - st1 {v7.16b}, [x5] + st1 {v7.16b}, [x6] b 8b 6: cbz x6, 8b - st1 {v2.16b}, [x5] + st1 {v2.16b}, [x6] b 8b 7: cbz x6, 8b - st1 {v5.16b}, [x5] + st1 {v5.16b}, [x6] b 8b ENDPROC(aesbs_ctr_encrypt) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 863e436ecf89..db2501d93550 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -34,7 +34,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], - int rounds, int blocks, u8 iv[], bool final); + int rounds, int blocks, u8 iv[], u8 final[]); asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); @@ -201,6 +201,7 @@ static int ctr_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; int err; err = skcipher_walk_virt(&walk, req, true); @@ -208,12 +209,12 @@ static int ctr_encrypt(struct skcipher_request *req) kernel_neon_begin(); while (walk.nbytes > 0) { unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; - bool final = (walk.total % AES_BLOCK_SIZE) != 0; + u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL; if (walk.nbytes < walk.total) { blocks = round_down(blocks, walk.stride / AES_BLOCK_SIZE); - final = false; + final = NULL; } aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, @@ -225,7 +226,7 @@ static int ctr_encrypt(struct skcipher_request *req) if (dst != src) memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, walk.iv, walk.total % AES_BLOCK_SIZE); + crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; -- cgit v1.2.3 From 5d3d9c8bda2c74b13185704e504cdf0aa5210723 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 1 Feb 2017 15:35:40 +0000 Subject: crypto: arm64/crc32 - merge CRC32 and PMULL instruction based drivers The PMULL based CRC32 implementation already contains code based on the separate, optional CRC32 instructions to fallback to when operating on small quantities of data. We can expose these routines directly on systems that lack the 64x64 PMULL instructions but do implement the CRC32 ones, which makes the driver that is based solely on those CRC32 instructions redundant. So remove it. Note that this aligns arm64 with ARM, whose accelerated CRC32 driver also combines the CRC32 extension based and the PMULL based versions. Signed-off-by: Ard Biesheuvel Tested-by: Matthias Brugger Signed-off-by: Herbert Xu --- arch/arm64/configs/defconfig | 1 - arch/arm64/crypto/Kconfig | 9 +- arch/arm64/crypto/Makefile | 4 - arch/arm64/crypto/crc32-arm64.c | 290 -------------------------------------- arch/arm64/crypto/crc32-ce-glue.c | 49 +++++-- 5 files changed, 41 insertions(+), 312 deletions(-) delete mode 100644 arch/arm64/crypto/crc32-arm64.c (limited to 'arch/arm64') diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 33b744d54739..6fc6f5a2a6e5 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -516,4 +516,3 @@ CONFIG_CRYPTO_GHASH_ARM64_CE=y CONFIG_CRYPTO_AES_ARM64_CE_CCM=y CONFIG_CRYPTO_AES_ARM64_CE_BLK=y # CONFIG_CRYPTO_AES_ARM64_NEON_BLK is not set -CONFIG_CRYPTO_CRC32_ARM64=y diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index bed7feddfeed..d92293747d63 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -37,8 +37,8 @@ config CRYPTO_CRCT10DIF_ARM64_CE select CRYPTO_HASH config CRYPTO_CRC32_ARM64_CE - tristate "CRC32 and CRC32C digest algorithms using PMULL instructions" - depends on KERNEL_MODE_NEON && CRC32 + tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions" + depends on CRC32 select CRYPTO_HASH config CRYPTO_AES_ARM64 @@ -71,11 +71,6 @@ config CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_AES select CRYPTO_SIMD -config CRYPTO_CRC32_ARM64 - tristate "CRC32 and CRC32C using optional ARMv8 instructions" - depends on ARM64 - select CRYPTO_HASH - config CRYPTO_CHACHA20_NEON tristate "NEON accelerated ChaCha20 symmetric cipher" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index d1ae1b9cbe70..b5edc5918c28 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -55,10 +55,6 @@ AFLAGS_aes-neon.o := -DINTERLEAVE=4 CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS -obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o - -CFLAGS_crc32-arm64.o := -mcpu=generic+crc - $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE $(call if_changed_rule,cc_o_c) diff --git a/arch/arm64/crypto/crc32-arm64.c b/arch/arm64/crypto/crc32-arm64.c deleted file mode 100644 index 6a37c3c6b11d..000000000000 --- a/arch/arm64/crypto/crc32-arm64.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions - * - * Module based on crypto/crc32c_generic.c - * - * CRC32 loop taken from Ed Nevill's Hadoop CRC patch - * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E - * - * Using inline assembly instead of intrinsics in order to be backwards - * compatible with older compilers. - * - * Copyright (C) 2014 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include - -#include - -MODULE_AUTHOR("Yazen Ghannam "); -MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions"); -MODULE_LICENSE("GPL v2"); - -#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) -#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) - -static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len) -{ - s64 length = len; - - while ((length -= sizeof(u64)) >= 0) { - CRC32X(crc, get_unaligned_le64(p)); - p += sizeof(u64); - } - - /* The following is more efficient than the straight loop */ - if (length & sizeof(u32)) { - CRC32W(crc, get_unaligned_le32(p)); - p += sizeof(u32); - } - if (length & sizeof(u16)) { - CRC32H(crc, get_unaligned_le16(p)); - p += sizeof(u16); - } - if (length & sizeof(u8)) - CRC32B(crc, *p); - - return crc; -} - -static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len) -{ - s64 length = len; - - while ((length -= sizeof(u64)) >= 0) { - CRC32CX(crc, get_unaligned_le64(p)); - p += sizeof(u64); - } - - /* The following is more efficient than the straight loop */ - if (length & sizeof(u32)) { - CRC32CW(crc, get_unaligned_le32(p)); - p += sizeof(u32); - } - if (length & sizeof(u16)) { - CRC32CH(crc, get_unaligned_le16(p)); - p += sizeof(u16); - } - if (length & sizeof(u8)) - CRC32CB(crc, *p); - - return crc; -} - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -struct chksum_ctx { - u32 key; -}; - -struct chksum_desc_ctx { - u32 crc; -}; - -static int chksum_init(struct shash_desc *desc) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = mctx->key; - - return 0; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. - */ -static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(tfm); - - if (keylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - mctx->key = get_unaligned_le32(key); - return 0; -} - -static int chksum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length); - return 0; -} - -static int chksumc_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length); - return 0; -} - -static int chksum_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - put_unaligned_le32(ctx->crc, out); - return 0; -} - -static int chksumc_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - put_unaligned_le32(~ctx->crc, out); - return 0; -} - -static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) -{ - put_unaligned_le32(crc32_arm64_le_hw(crc, data, len), out); - return 0; -} - -static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) -{ - put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out); - return 0; -} - -static int chksum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(ctx->crc, data, len, out); -} - -static int chksumc_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksumc_finup(ctx->crc, data, len, out); -} - -static int chksum_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - - return __chksum_finup(mctx->key, data, length, out); -} - -static int chksumc_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); - - return __chksumc_finup(mctx->key, data, length, out); -} - -static int crc32_cra_init(struct crypto_tfm *tfm) -{ - struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); - - mctx->key = 0; - return 0; -} - -static int crc32c_cra_init(struct crypto_tfm *tfm) -{ - struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); - - mctx->key = ~0; - return 0; -} - -static struct shash_alg crc32_alg = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-arm64-hw", - .cra_priority = 300, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_alignmask = 0, - .cra_ctxsize = sizeof(struct chksum_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32_cra_init, - } -}; - -static struct shash_alg crc32c_alg = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = chksum_setkey, - .init = chksum_init, - .update = chksumc_update, - .final = chksumc_final, - .finup = chksumc_finup, - .digest = chksumc_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-arm64-hw", - .cra_priority = 300, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_alignmask = 0, - .cra_ctxsize = sizeof(struct chksum_ctx), - .cra_module = THIS_MODULE, - .cra_init = crc32c_cra_init, - } -}; - -static int __init crc32_mod_init(void) -{ - int err; - - err = crypto_register_shash(&crc32_alg); - - if (err) - return err; - - err = crypto_register_shash(&crc32c_alg); - - if (err) { - crypto_unregister_shash(&crc32_alg); - return err; - } - - return 0; -} - -static void __exit crc32_mod_exit(void) -{ - crypto_unregister_shash(&crc32_alg); - crypto_unregister_shash(&crc32c_alg); -} - -module_cpu_feature_match(CRC32, crc32_mod_init); -module_exit(crc32_mod_exit); diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c index 8594127d5e01..eccb1ae90064 100644 --- a/arch/arm64/crypto/crc32-ce-glue.c +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -72,6 +72,24 @@ static int crc32_pmull_init(struct shash_desc *desc) return 0; } +static int crc32_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + + *crc = crc32_armv8_le(*crc, data, length); + return 0; +} + +static int crc32c_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + + *crc = crc32c_armv8_le(*crc, data, length); + return 0; +} + static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, unsigned int length) { @@ -156,7 +174,7 @@ static int crc32c_pmull_final(struct shash_desc *desc, u8 *out) static struct shash_alg crc32_pmull_algs[] = { { .setkey = crc32_pmull_setkey, .init = crc32_pmull_init, - .update = crc32_pmull_update, + .update = crc32_update, .final = crc32_pmull_final, .descsize = sizeof(u32), .digestsize = sizeof(u32), @@ -171,7 +189,7 @@ static struct shash_alg crc32_pmull_algs[] = { { }, { .setkey = crc32_pmull_setkey, .init = crc32_pmull_init, - .update = crc32c_pmull_update, + .update = crc32c_update, .final = crc32c_pmull_final, .descsize = sizeof(u32), .digestsize = sizeof(u32), @@ -187,14 +205,20 @@ static struct shash_alg crc32_pmull_algs[] = { { static int __init crc32_pmull_mod_init(void) { - if (elf_hwcap & HWCAP_CRC32) { - fallback_crc32 = crc32_armv8_le; - fallback_crc32c = crc32c_armv8_le; - } else { - fallback_crc32 = crc32_le; - fallback_crc32c = __crc32c_le; + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) { + crc32_pmull_algs[0].update = crc32_pmull_update; + crc32_pmull_algs[1].update = crc32c_pmull_update; + + if (elf_hwcap & HWCAP_CRC32) { + fallback_crc32 = crc32_armv8_le; + fallback_crc32c = crc32c_armv8_le; + } else { + fallback_crc32 = crc32_le; + fallback_crc32c = __crc32c_le; + } + } else if (!(elf_hwcap & HWCAP_CRC32)) { + return -ENODEV; } - return crypto_register_shashes(crc32_pmull_algs, ARRAY_SIZE(crc32_pmull_algs)); } @@ -205,7 +229,12 @@ static void __exit crc32_pmull_mod_exit(void) ARRAY_SIZE(crc32_pmull_algs)); } -module_cpu_feature_match(PMULL, crc32_pmull_mod_init); +static const struct cpu_feature crc32_cpu_feature[] = { + { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature); + +module_init(crc32_pmull_mod_init); module_exit(crc32_pmull_mod_exit); MODULE_AUTHOR("Ard Biesheuvel "); -- cgit v1.2.3 From 4860620da7e5752d916737472c40be573aec1869 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 3 Feb 2017 14:49:37 +0000 Subject: crypto: arm64/aes - add NEON/Crypto Extensions CBCMAC/CMAC/XCBC driver On ARMv8 implementations that do not support the Crypto Extensions, such as the Raspberry Pi 3, the CCM driver falls back to the generic table based AES implementation to perform the MAC part of the algorithm, which is slow and not time invariant. So add a CBCMAC implementation to the shared glue code between NEON AES and Crypto Extensions AES, so that it can be used instead now that the CCM driver has been updated to look for CBCMAC implementations other than the one it supplies itself. Also, given how these algorithms mostly only differ in the way the key handling and the final encryption are implemented, expose CMAC and XCBC algorithms as well based on the same core update code. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 240 +++++++++++++++++++++++++++++++++++++++++- arch/arm64/crypto/aes-modes.S | 29 ++++- 2 files changed, 267 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 055bc3f61138..bcf596b0197e 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -1,7 +1,7 @@ /* * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES * - * Copyright (C) 2013 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #define aes_ctr_encrypt ce_aes_ctr_encrypt #define aes_xts_encrypt ce_aes_xts_encrypt #define aes_xts_decrypt ce_aes_xts_decrypt +#define aes_mac_update ce_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #else #define MODE "neon" @@ -44,11 +46,15 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #define aes_ctr_encrypt neon_aes_ctr_encrypt #define aes_xts_encrypt neon_aes_xts_encrypt #define aes_xts_decrypt neon_aes_xts_decrypt +#define aes_mac_update neon_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_ALIAS_CRYPTO("cmac(aes)"); +MODULE_ALIAS_CRYPTO("xcbc(aes)"); +MODULE_ALIAS_CRYPTO("cbcmac(aes)"); #endif MODULE_AUTHOR("Ard Biesheuvel "); @@ -75,11 +81,25 @@ asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, int blocks, u8 const rk2[], u8 iv[], int first); +asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); + struct crypto_aes_xts_ctx { struct crypto_aes_ctx key1; struct crypto_aes_ctx __aligned(8) key2; }; +struct mac_tfm_ctx { + struct crypto_aes_ctx key; + u8 __aligned(8) consts[]; +}; + +struct mac_desc_ctx { + unsigned int len; + u8 dg[AES_BLOCK_SIZE]; +}; + static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -357,6 +377,217 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = xts_decrypt, } }; +static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + int err; + + err = aes_expandkey(&ctx->key, in_key, key_len); + if (err) + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + + return err; +} + +static void cmac_gf128_mul_by_x(be128 *y, const be128 *x) +{ + u64 a = be64_to_cpu(x->a); + u64 b = be64_to_cpu(x->b); + + y->a = cpu_to_be64((a << 1) | (b >> 63)); + y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); +} + +static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + be128 *consts = (be128 *)ctx->consts; + u8 *rk = (u8 *)ctx->key.key_enc; + int rounds = 6 + key_len / 4; + int err; + + err = cbcmac_setkey(tfm, in_key, key_len); + if (err) + return err; + + /* encrypt the zero vector */ + kernel_neon_begin(); + aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1); + kernel_neon_end(); + + cmac_gf128_mul_by_x(consts, consts); + cmac_gf128_mul_by_x(consts + 1, consts); + + return 0; +} + +static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key, + unsigned int key_len) +{ + static u8 const ks[3][AES_BLOCK_SIZE] = { + { [0 ... AES_BLOCK_SIZE - 1] = 0x1 }, + { [0 ... AES_BLOCK_SIZE - 1] = 0x2 }, + { [0 ... AES_BLOCK_SIZE - 1] = 0x3 }, + }; + + struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); + u8 *rk = (u8 *)ctx->key.key_enc; + int rounds = 6 + key_len / 4; + u8 key[AES_BLOCK_SIZE]; + int err; + + err = cbcmac_setkey(tfm, in_key, key_len); + if (err) + return err; + + kernel_neon_begin(); + aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1); + aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0); + kernel_neon_end(); + + return cbcmac_setkey(tfm, key, sizeof(key)); +} + +static int mac_init(struct shash_desc *desc) +{ + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + + memset(ctx->dg, 0, AES_BLOCK_SIZE); + ctx->len = 0; + + return 0; +} + +static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + int rounds = 6 + tctx->key.key_length / 4; + + while (len > 0) { + unsigned int l; + + if ((ctx->len % AES_BLOCK_SIZE) == 0 && + (ctx->len + len) > AES_BLOCK_SIZE) { + + int blocks = len / AES_BLOCK_SIZE; + + len %= AES_BLOCK_SIZE; + + kernel_neon_begin(); + aes_mac_update(p, tctx->key.key_enc, rounds, blocks, + ctx->dg, (ctx->len != 0), (len != 0)); + kernel_neon_end(); + + p += blocks * AES_BLOCK_SIZE; + + if (!len) { + ctx->len = AES_BLOCK_SIZE; + break; + } + ctx->len = 0; + } + + l = min(len, AES_BLOCK_SIZE - ctx->len); + + if (l <= AES_BLOCK_SIZE) { + crypto_xor(ctx->dg + ctx->len, p, l); + ctx->len += l; + len -= l; + p += l; + } + } + + return 0; +} + +static int cbcmac_final(struct shash_desc *desc, u8 *out) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + int rounds = 6 + tctx->key.key_length / 4; + + kernel_neon_begin(); + aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0); + kernel_neon_end(); + + memcpy(out, ctx->dg, AES_BLOCK_SIZE); + + return 0; +} + +static int cmac_final(struct shash_desc *desc, u8 *out) +{ + struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + int rounds = 6 + tctx->key.key_length / 4; + u8 *consts = tctx->consts; + + if (ctx->len != AES_BLOCK_SIZE) { + ctx->dg[ctx->len] ^= 0x80; + consts += AES_BLOCK_SIZE; + } + + kernel_neon_begin(); + aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1); + kernel_neon_end(); + + memcpy(out, ctx->dg, AES_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg mac_algs[] = { { + .base.cra_name = "cmac(aes)", + .base.cra_driver_name = "cmac-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + + 2 * AES_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cmac_final, + .setkey = cmac_setkey, + .descsize = sizeof(struct mac_desc_ctx), +}, { + .base.cra_name = "xcbc(aes)", + .base.cra_driver_name = "xcbc-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + + 2 * AES_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cmac_final, + .setkey = xcbc_setkey, + .descsize = sizeof(struct mac_desc_ctx), +}, { + .base.cra_name = "cbcmac(aes)", + .base.cra_driver_name = "cbcmac-aes-" MODE, + .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), + .base.cra_module = THIS_MODULE, + + .digestsize = AES_BLOCK_SIZE, + .init = mac_init, + .update = mac_update, + .final = cbcmac_final, + .setkey = cbcmac_setkey, + .descsize = sizeof(struct mac_desc_ctx), +} }; + static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; static void aes_exit(void) @@ -367,6 +598,7 @@ static void aes_exit(void) if (aes_simd_algs[i]) simd_skcipher_free(aes_simd_algs[i]); + crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs)); crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } @@ -383,6 +615,10 @@ static int __init aes_init(void) if (err) return err; + err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs)); + if (err) + goto unregister_ciphers; + for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) continue; @@ -402,6 +638,8 @@ static int __init aes_init(void) unregister_simds: aes_exit(); +unregister_ciphers: + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); return err; } diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 92b982a8b112..2674d43d1384 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -1,7 +1,7 @@ /* * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES * - * Copyright (C) 2013 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -525,3 +525,30 @@ AES_ENTRY(aes_xts_decrypt) FRAME_POP ret AES_ENDPROC(aes_xts_decrypt) + + /* + * aes_mac_update(u8 const in[], u32 const rk[], int rounds, + * int blocks, u8 dg[], int enc_before, int enc_after) + */ +AES_ENTRY(aes_mac_update) + ld1 {v0.16b}, [x4] /* get dg */ + enc_prepare w2, x1, x7 + cbnz w5, .Lmacenc + +.Lmacloop: + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + + subs w3, w3, #1 + csinv x5, x6, xzr, eq + cbz w5, .Lmacout + +.Lmacenc: + encrypt_block v0, w2, x1, x7, w8 + b .Lmacloop + +.Lmacout: + st1 {v0.16b}, [x4] /* return dg */ + ret +AES_ENDPROC(aes_mac_update) -- cgit v1.2.3