diff options
author | Jason A. Donenfeld <Jason@zx2c4.com> | 2020-01-06 06:40:48 +0300 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2020-01-16 10:18:12 +0300 |
commit | d7d7b853566254648df59f7ea27ea05952a6cfa8 (patch) | |
tree | 1dd0bf094b188627ba18438f51df753d5d969086 /arch/x86/crypto/poly1305_glue.c | |
parent | 0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c (diff) | |
download | linux-d7d7b853566254648df59f7ea27ea05952a6cfa8.tar.xz |
crypto: x86/poly1305 - wire up faster implementations for kernel
These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
The AVX-512F implementation is disabled on Skylake, due to throttling,
but it is quite fast on >= Cannonlake.
On the left is cycle counts on a Core i7 6700HQ using the AVX-2
codepath, comparing this implementation ("new") to the implementation in
the current crypto api ("old"). On the right are benchmarks on a Xeon
Gold 5120 using the AVX-512 codepath. The new implementation is faster
on all benchmarks.
AVX-2 AVX-512
--------- -----------
size old new size old new
---- ---- ---- ---- ---- ----
0 70 68 0 74 70
16 92 90 16 96 92
32 134 104 32 136 106
48 172 120 48 184 124
64 218 136 64 218 138
80 254 158 80 260 160
96 298 174 96 300 176
112 342 192 112 342 194
128 388 212 128 384 212
144 428 228 144 420 226
160 466 246 160 464 248
176 510 264 176 504 264
192 550 282 192 544 282
208 594 302 208 582 300
224 628 316 224 624 318
240 676 334 240 662 338
256 716 354 256 708 358
272 764 374 272 748 372
288 802 352 288 788 358
304 420 366 304 422 370
320 428 360 320 432 364
336 484 378 336 486 380
352 426 384 352 434 390
368 478 400 368 480 408
384 488 394 384 490 398
400 542 408 400 542 412
416 486 416 416 492 426
432 534 430 432 538 436
448 544 422 448 546 432
464 600 438 464 600 448
480 540 448 480 548 456
496 594 464 496 594 476
512 602 456 512 606 470
528 656 476 528 656 480
544 600 480 544 606 498
560 650 494 560 652 512
576 664 490 576 662 508
592 714 508 592 716 522
608 656 514 608 664 538
624 708 532 624 710 552
640 716 524 640 720 516
656 770 536 656 772 526
672 716 548 672 722 544
688 770 562 688 768 556
704 774 552 704 778 556
720 826 568 720 832 568
736 768 574 736 780 584
752 822 592 752 826 600
768 830 584 768 836 560
784 884 602 784 888 572
800 828 610 800 838 588
816 884 628 816 884 604
832 888 618 832 894 598
848 942 632 848 946 612
864 884 644 864 896 628
880 936 660 880 942 644
896 948 652 896 952 608
912 1000 664 912 1004 616
928 942 676 928 954 634
944 994 690 944 1000 646
960 1002 680 960 1008 646
976 1054 694 976 1062 658
992 1002 706 992 1012 674
1008 1052 720 1008 1058 690
This commit wires in the prior implementation from Andy, and makes the
following changes to be suitable for kernel land.
- Some cosmetic and structural changes, like renaming labels to
.Lname, constants, and other Linux conventions, as well as making
the code easy for us to maintain moving forward.
- CPU feature checking is done in C by the glue code.
- We avoid jumping into the middle of functions, to appease objtool,
and instead parameterize shared code.
- We maintain frame pointers so that stack traces make sense.
- We remove the dependency on the perl xlate code, which transforms
the output into things that assemblers we don't care about use.
Importantly, none of our changes affect the arithmetic or core code, but
just involve the differing environment of kernel space.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/poly1305_glue.c')
-rw-r--r-- | arch/x86/crypto/poly1305_glue.c | 473 |
1 files changed, 167 insertions, 306 deletions
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index edb7113e36f3..657363588e0c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -1,8 +1,6 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <crypto/algapi.h> @@ -13,279 +11,170 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <asm/intel-family.h> #include <asm/simd.h> -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, - const u32 *r, unsigned int blocks); -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; -static inline u64 mlt(u64 a, u64 b) +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) { - return a * b; -} + struct poly1305_arch_internal *state = ctx; + u32 cy; -static inline u32 sr(u64 v, u_char n) -{ - return v >> n; -} + if (!state->is_base2_26) + return; -static inline u32 and(u32 v, u32 mask) -{ - return v & mask; + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; } -static void poly1305_simd_mult(u32 *a, const u32 *b) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) { - u8 m[POLY1305_BLOCK_SIZE]; - - memset(m, 0, sizeof(m)); - /* The poly1305 block function adds a hi-bit to the accumulator which - * we don't need for key multiplication; compensate for it. */ - a[4] -= 1 << 24; - poly1305_block_sse2(a, m, b, 1); + poly1305_init_x86_64(ctx, key); } -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, + const u32 padbit) { - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; -} + struct poly1305_arch_internal *state = ctx; -static void poly1305_integer_blocks(struct poly1305_state *state, - const struct poly1305_key *key, - const void *src, - unsigned int nblocks, u32 hibit) -{ - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; - u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || + PAGE_SIZE % POLY1305_BLOCK_SIZE); - if (!nblocks) + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || + !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); return; + } - r0 = key->r[0]; - r1 = key->r[1]; - r2 = key->r[2]; - r3 = key->r[3]; - r4 = key->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - do { - /* h += m[i] */ - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); - - /* h *= r */ - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + - mlt(h3, s2) + mlt(h4, s1); - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + - mlt(h3, s3) + mlt(h4, s2); - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + - mlt(h3, s4) + mlt(h4, s3); - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + - mlt(h3, r0) + mlt(h4, s4); - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + - mlt(h3, r1) + mlt(h4, r0); - - /* (partial) h %= p */ - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; - - src += POLY1305_BLOCK_SIZE; - } while (--nblocks); - - state->h[0] = h0; - state->h[1] = h1; - state->h[2] = h2; - state->h[3] = h3; - state->h[4] = h4; + for (;;) { + const size_t bytes = min_t(size_t, len, PAGE_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + len -= bytes; + if (!len) + break; + inp += bytes; + } } -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) { - u32 h0, h1, h2, h3, h4; - u32 g0, g1, g2, g3, g4; - u32 mask; - - /* fully carry h */ - h0 = state->h[0]; - h1 = state->h[1]; - h2 = state->h[2]; - h3 = state->h[3]; - h4 = state->h[4]; - - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; - - /* compute h + -p */ - g0 = h0 + 5; - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); + struct poly1305_arch_internal *state = ctx; + + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + !state->is_base2_26 || !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_emit_x86_64(ctx, mac, nonce); + } else + poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) { - poly1305_integer_setkey(desc->opaque_r, key); - desc->s[0] = get_unaligned_le32(key + 16); - desc->s[1] = get_unaligned_le32(key + 20); - desc->s[2] = get_unaligned_le32(key + 24); - desc->s[3] = get_unaligned_le32(key + 28); - poly1305_core_init(&desc->h); - desc->buflen = 0; - desc->sset = true; - desc->rset = 1; + poly1305_simd_init(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(&key[16]); + dctx->s[1] = get_unaligned_le32(&key[20]); + dctx->s[2] = get_unaligned_le32(&key[24]); + dctx->s[3] = get_unaligned_le32(&key[28]); + dctx->buflen = 0; + dctx->sset = true; } -EXPORT_SYMBOL_GPL(poly1305_init_arch); +EXPORT_SYMBOL(poly1305_init_arch); -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) { - if (!dctx->sset) { - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_setkey(dctx->r, src); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; + unsigned int acc = 0; + if (unlikely(!dctx->sset)) { + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + poly1305_simd_init(&dctx->h, inp); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; dctx->rset = 1; } - if (srclen >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(&inp[0]); + dctx->s[1] = get_unaligned_le32(&inp[4]); + dctx->s[2] = get_unaligned_le32(&inp[8]); + dctx->s[3] = get_unaligned_le32(&inp[12]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; dctx->sset = true; } } - return srclen; -} - -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int datalen; - - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, - srclen / POLY1305_BLOCK_SIZE, 1); - srclen %= POLY1305_BLOCK_SIZE; - } - return srclen; -} - -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) -{ - unsigned int blocks, datalen; - - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } - - if (IS_ENABLED(CONFIG_AS_AVX2) && - static_branch_likely(&poly1305_use_avx2) && - srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(dctx->rset < 4)) { - if (dctx->rset < 2) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - } - dctx->r[2] = dctx->r[1]; - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); - dctx->r[3] = dctx->r[2]; - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); - dctx->rset = 4; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, - dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 4 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(dctx->rset < 2)) { - dctx->r[1] = dctx->r[0]; - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); - dctx->rset = 2; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, - blocks, dctx->r[1].r); - src += POLY1305_BLOCK_SIZE * 2 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); - srclen -= POLY1305_BLOCK_SIZE; - } - return srclen; + return acc; } void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - unsigned int bytes; + unsigned int bytes, used; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - kernel_fpu_end(); - } else { - poly1305_scalar_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); - } + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - if (static_branch_likely(&poly1305_use_simd) && - likely(crypto_simd_usable())) { - kernel_fpu_begin(); - bytes = poly1305_simd_blocks(dctx, src, srclen); - kernel_fpu_end(); - } else { - bytes = poly1305_scalar_blocks(dctx, src, srclen); - } - src += srclen - bytes; - srclen = bytes; + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + srclen -= bytes; + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(bytes - used)) + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); + src += bytes; } if (unlikely(srclen)) { @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, } EXPORT_SYMBOL(poly1305_update_arch); -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) { - __le32 digest[4]; - u64 f = 0; - - if (unlikely(desc->buflen)) { - desc->buf[desc->buflen++] = 1; - memset(desc->buf + desc->buflen, 0, - POLY1305_BLOCK_SIZE - desc->buflen); - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); } - poly1305_integer_emit(&desc->h, digest); - - /* mac = (h + s) % (2^128) */ - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; - put_unaligned_le32(f, dst + 0); - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; - put_unaligned_le32(f, dst + 4); - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; - put_unaligned_le32(f, dst + 8); - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; - put_unaligned_le32(f, dst + 12); - - *desc = (struct poly1305_desc_ctx){}; + poly1305_simd_emit(&dctx->h, dst, dctx->s); + *dctx = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final_arch); @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_core_init(&dctx->h); - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - + *dctx = (struct poly1305_desc_ctx){}; return 0; } -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +static int crypto_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); + poly1305_update_arch(dctx, src, srclen); return 0; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - poly1305_update_arch(dctx, src, srclen); + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_arch(dctx, dst); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_poly1305_init, - .update = poly1305_simd_update, + .update = crypto_poly1305_update, .final = crypto_poly1305_final, .descsize = sizeof(struct poly1305_desc_ctx), .base = { @@ -406,17 +265,19 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2)) - return 0; - - static_branch_enable(&poly1305_use_simd); - - if (IS_ENABLED(CONFIG_AS_AVX2) && - boot_cpu_has(X86_FEATURE_AVX) && + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx2); - + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); module_exit(poly1305_simd_mod_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-simd"); |