/* * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental * SSE3 instruction set extensions introduced in Intel Core Microarchitecture * processors. CPUs supporting Intel(R) AVX extensions will get an additional * boost. * * This work was inspired by the vectorized implementation of Dean Gaudet. * Additional information on it can be found at: * http://www.arctic.org/~dean/crypto/sha1.html * * It was improved upon with more efficient vectorization of the message * scheduling. This implementation has also been optimized for all current and * several future generations of Intel CPUs. * * See this article for more information about the implementation details: * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ * * Copyright (C) 2010, Intel Corp. * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> * Ronen Zohar <ronen.zohar@intel.com> * * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: * Author: Mathias Krause <minipli@googlemail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
*/

/*
 * Syntax: GNU as, AT&T operand order (source, destination). The file is
 * run through the C preprocessor first (#define / #ifdef are used below).
 *
 * Register roles
 * --------------
 * The three arguments arrive in %rdi, %rsi and %rdx.  The SHA-1 working
 * variables reuse the low halves of the very same registers (REG_C is %edi,
 * REG_B is %esi, REG_E is %edx), so the entry code copies the arguments to
 * HASH_PTR / BUFFER_PTR / BUFFER_END before the state words are loaded.
 */
#define CTX             %rdi    // arg1
#define BUF             %rsi    // arg2
#define CNT             %rdx    // arg3

#define REG_A           %ecx
#define REG_B           %esi
#define REG_C           %edi
#define REG_D           %ebp
#define REG_E           %edx

#define REG_T1          %eax
#define REG_T2          %ebx

#define K_BASE          %r8
#define HASH_PTR        %r9
#define BUFFER_PTR      %r10
#define BUFFER_END      %r11

#define W_TMP1          %xmm0
#define W_TMP2          %xmm9

#define W0              %xmm1
#define W4              %xmm2
#define W8              %xmm3
#define W12             %xmm4
#define W16             %xmm5
#define W20             %xmm6
#define W24             %xmm7
#define W28             %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
#define WK(t)           (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16

/*
 * This macro emits a complete SHA-1 transform function that processes CNT
 * consecutive 64-byte blocks.
 * param: function's name
 *
 * In:    CTX (%rdi) = pointer to the five 32-bit hash state words
 *        BUF (%rsi) = pointer to the input data
 *        CNT (%rdx) = number of 64-byte blocks
 * Saved: %rbx, %rbp, %r12 are pushed on entry and popped before return.
 * Used:  %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm10, flags.
 * Stack: 64 bytes of 16-byte aligned workspace (the WK() ring buffer); it
 *        is overwritten with zeros before the function returns.
 *
 * The main body hashes the first block before it compares any pointers, so
 * at least one block is always processed; callers must pass CNT >= 1.
 *
 * NOTE(review): CNT is shifted as a full 64-bit register.  If the C side
 * declares the block count as a 32-bit type, this relies on the caller
 * having cleared the upper half of %rdx - confirm against the C prototype.
 */
.macro SHA1_VECTOR_ASM  name
        .global \name
        .type   \name, @function
        .align 32
\name:
        push    %rbx
        push    %rbp
        push    %r12

        mov     %rsp, %r12              # keep original %rsp for the epilogue
        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        /* move the arguments out of the registers reused for B, C and E */
        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END         # BUFFER_END = BUF + 64 * CNT

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        /* cleanup workspace: store 8 zero quadwords (64 bytes) at %rsp */
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %eax, %eax              # 32-bit write clears all of %rax
        rep stosq

        mov     %r12, %rsp              # deallocate workspace

        pop     %r12
        pop     %rbp
        pop     %rbx
        ret

        .size   \name, .-\name
.endm

/*
 * This macro implements the block loop: 80 rounds of SHA-1 per 64-byte
 * block, repeated until BUFFER_PTR reaches BUFFER_END.
 *
 * The W[]+K values of the first block are pre-computed before the loop.
 * Inside the loop every RR runs W_PRECALC for a round W_PRECALC_AHEAD (16)
 * ahead, so the RRs for rounds 64..79 pre-compute rounds 0..15 of the
 * following block.  BUFFER_PTR is therefore advanced ahead of round 64.
 * After the last block BUFFER_PTR is replaced by K_BASE: the look-ahead
 * loads then read the 64-byte constant table instead of running past the
 * input, and the same value doubles as the loop-exit marker.
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
        W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS

        cmp     K_BASE, BUFFER_PTR      # K_BASE means, we reached the end
        jne     1b
.endm

/*
 * Bind the symbolic names A..E, T1 and T2 to their initial physical
 * registers.  The names are assembler symbols, so later renames cost no
 * instructions.
 */
.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

/*
 * Copy the values held under the (renamed) symbolic names back into the
 * physical registers INIT_REGALLOC starts from, so that the loop body can
 * be entered again with the initial binding.
 */
.macro RESTORE_RENAMED_REGS
        /* order is important (REG_C is where it should be) */
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

/* Exchange the registers two symbolic names refer to (assembly time only) */
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

/*
 * Round functions.  Each one leaves its result in T1.  The source operand is
 * first copied to T1 and then the two names are swapped, so the computation
 * below the swap is carried out in the register that still carries the
 * original name.
 */

/* F1: T1 = ((c ^ d) & b) ^ d */
.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

/* F2: T1 = b ^ c ^ d */
.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

/* F3: T1 = ((b | c) & d) | (b & c); uses T2 as scratch */
.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

/* F4 is the same function as F2 */
.macro F4  b, c, d
        F2 \b, \c, \d
.endm

/* hash += val; val = hash  (the register keeps the updated state word) */
.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F   \b, \c, \d                 # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F   \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a                  # (a <<r 5) >>r 7) => a <<r 30)

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        /*
         * write:  a, b
         * rotate: a<=d, b<=e, c<=a, d<=b, e<=c
         */
.endm

/*
 * Emit one slice of the message schedule computation for round r.
 *
 * K_XMM is set to the byte offset of the round constant inside K_XMM_AR
 * for rounds below 80.  Rounds 80 .. 80+W_PRECALC_AHEAD-1 are mapped back
 * to 0..15: they pre-compute the start of the next block.
 */
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)         # pre-compute for the next iteration
    .if (i == 0)
        W_PRECALC_RESET
    .endif
        W_PRECALC_00_15
  .elseif (i<32)
        W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
        W_PRECALC_32_79
  .endif
.endm

/* Bind the sliding-window names to their initial XMM registers */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

/*
 * Advance the sliding window by one 4-word vector: every name moves one
 * slot back and W takes over the register of the oldest vector.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

/*
 * Invoking this macro defines the SSSE3 flavour of the message schedule
 * macros and points W_PRECALC_00_15 / _16_31 / _32_79 at them.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
        movdqu  (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        pshufb  XMM_SHUFB_BSWAP, W_TMP1
        movdqa  W_TMP1, W
  .elseif ((i & 3) == 2)
        paddd   (K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  /*
   * blended scheduling of vector and scalar instruction streams, one 4-wide
   * vector iteration / 4 scalar rounds
   */
  .if ((i & 3) == 0)
        movdqa  W_minus_12, W
        palignr $8, W_minus_16, W       # w[i-14]
        movdqa  W_minus_04, W_TMP1
        psrldq  $4, W_TMP1              # w[i-3]
        pxor    W_minus_08, W
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W_TMP1
        pxor    W_TMP1, W
        movdqa  W, W_TMP2
        movdqa  W, W_TMP1
        pslldq  $12, W_TMP2
  .elseif ((i & 3) == 2)
        psrld   $31, W
        pslld   $1, W_TMP1
        por     W, W_TMP1
        movdqa  W_TMP2, W
        psrld   $30, W_TMP2
        pslld   $2, W
  .elseif ((i & 3) == 3)
        pxor    W, W_TMP1
        pxor    W_TMP2, W_TMP1
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
        movdqa  W_minus_04, W_TMP1
        pxor    W_minus_28, W           # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W
        pxor    W_TMP1, W
        movdqa  W, W_TMP1
  .elseif ((i & 3) == 2)
        psrld   $30, W
        pslld   $2, W_TMP1
        por     W, W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm           // W_PRECALC_SSSE3


/* SHA-1 round constants, one per group of 20 rounds */
#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

/* each constant replicated four times; indexed by K_XMM (0/16/32/48) */
K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

/* pshufb control mask: reverses the byte order inside every 32-bit word */
BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/* SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                       unsigned int rounds);
 *
 * The code reads exactly three arguments (%rdi, %rsi, %rdx); "rounds" is
 * the number of 64-byte blocks.
 * NOTE(review): the C declaration is not visible from this file - confirm
 * that it matches this three-argument form.
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

/*
 * Invoking this macro replaces the SSSE3 message schedule macros with the
 * AVX flavour (three-operand VEX encoded instructions).
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/* message scheduling pre-compute for rounds 0-15 (AVX) */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
        vpaddd  (K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31 (AVX) */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
        vpxor   W_minus_08, W, W
        vpxor   W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
        vpxor   W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld  $1, W, W_TMP1
  .elseif ((i & 3) == 2)
        vpsrld  $31, W, W
        vpor    W, W_TMP1, W_TMP1
        vpslld  $2, W_TMP2, W
        vpsrld  $30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
        vpxor   W, W_TMP1, W_TMP1
        vpxor   W_TMP2, W_TMP1, W
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79 (AVX) */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor   W_minus_28, W, W        # W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
        vpxor   W_minus_16, W_TMP1, W_TMP1
        vpxor   W_TMP1, W, W
  .elseif ((i & 3) == 2)
        vpslld  $2, W, W_TMP1
        vpsrld  $30, W, W
        vpor    W, W_TMP1, W
  .elseif ((i & 3) == 3)
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm           // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                     unsigned int rounds);
 *
 * Same three-argument register interface as the SSSE3 variant.
 * NOTE(review): confirm against the C declaration, which is not visible
 * from this file.
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif