diff options
-rw-r--r-- | arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 |
1 files changed, 92 insertions, 141 deletions
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index bbf860e90951..752812bc4991 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -7,6 +7,7 @@ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf * * Copyright (C) 2012 Intel Corporation. + * Copyright 2024 Google LLC * * Authors: * Wajdi Feghali <wajdi.k.feghali@intel.com> @@ -44,18 +45,9 @@ */ #include <linux/linkage.h> -#include <asm/nospec-branch.h> ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction -.macro LABEL prefix n -.L\prefix\n\(): -.endm - -.macro JMPTBL_ENTRY i -.quad .Lcrc_\i -.endm - # Define threshold below which buffers are considered "small" and routed to # regular CRC code that does not interleave the CRC instructions. #define SMALL_SIZE 200 @@ -64,139 +56,116 @@ .text SYM_FUNC_START(crc_pcl) -#define bufp rdi -#define bufp_dw %edi -#define bufp_w %di -#define bufp_b %dil -#define bufptmp %rcx -#define block_0 %rcx -#define block_1 %rdx -#define block_2 %r11 -#define len %esi -#define crc_init_arg %edx -#define tmp %rbx -#define crc_init %r8d -#define crc_init_q %r8 -#define crc1 %r9 -#define crc2 %r10 - - pushq %rbx - pushq %rdi - pushq %rsi +#define bufp %rdi +#define bufp_d %edi +#define len %esi +#define crc_init %edx +#define crc_init_q %rdx +#define n_misaligned %ecx /* overlaps chunk_bytes! */ +#define n_misaligned_q %rcx +#define chunk_bytes %ecx /* overlaps n_misaligned! */ +#define chunk_bytes_q %rcx +#define crc1 %r8 +#define crc2 %r9 - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init - - mov %bufp, bufptmp # rdi = *buf cmp $SMALL_SIZE, len jb .Lsmall ################################################################ ## 1) ALIGN: ################################################################ - neg %bufp - and $7, %bufp # calculate the unalignment amount of + mov bufp_d, n_misaligned + neg n_misaligned + and $7, n_misaligned # calculate the misalignment amount of # the address - je .Lproc_block # Skip if aligned + je .Laligned # Skip if aligned + # Process 1 <= n_misaligned <= 7 bytes individually in order to align + # the remaining data to an 8-byte boundary. .Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing - sub bufp_dw, len # update buffer length + movq (bufp), %rax + add n_misaligned_q, bufp + sub n_misaligned, len .Lalign_loop: - crc32b %bl, crc_init # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp + crc32b %al, crc_init # compute crc32 of 1-byte + shr $8, %rax # get next byte + dec n_misaligned jne .Lalign_loop - -.Lproc_block: +.Laligned: ################################################################ - ## 2) PROCESS BLOCKS: + ## 2) PROCESS BLOCK: ################################################################ - ## compute num of bytes to be processed - cmp $128*24, len jae .Lfull_block -.Lcontinue_block: - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do - - ## process rax 24-byte chunks (128 >= rax >= 0) - - ## compute end address of each block - ## block 0 (base addr + RAX * 8) - ## block 1 (base addr + RAX * 16) - ## block 2 (base addr + RAX * 24) - lea (bufptmp, %rax, 8), block_0 - lea (block_0, %rax, 8), block_1 - lea (block_1, %rax, 8), block_2 - - xor crc1, crc1 - xor crc2, crc2 - - ## branch into array - leaq jump_table(%rip), %bufp - mov (%bufp,%rax,8), %bufp - JMP_NOSPEC bufp +.Lpartial_block: + # Compute floor(len / 24) to get num qwords to process from each lane. + imul $2731, len, %eax # 2731 = ceil(2^16 / 24) + shr $16, %eax + jmp .Lcrc_3lanes - ################################################################ - ## 2a) PROCESS FULL BLOCKS: - ################################################################ .Lfull_block: - movl $128,%eax - lea 128*8*2(block_0), block_1 - lea 128*8*3(block_0), block_2 - add $128*8*1, block_0 - - xor crc1,crc1 - xor crc2,crc2 - - # Fall through into top of crc array (crc_128) + # Processing 128 qwords from each lane. + mov $128, %eax ################################################################ - ## 3) CRC Array: + ## 3) CRC each of three lanes: ################################################################ - i=128 -.rept 128-1 -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init_q - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -.endr - -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init_q - crc32q -i*8(block_1), crc1 -# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet +.Lcrc_3lanes: + xor crc1,crc1 + xor crc2,crc2 + mov %eax, chunk_bytes + shl $3, chunk_bytes # num bytes to process from each lane + sub $5, %eax # 4 for 4x_loop, 1 for special last iter + jl .Lcrc_3lanes_4x_done + + # Unroll the loop by a factor of 4 to reduce the overhead of the loop + # bookkeeping instructions, which can compete with crc32q for the ALUs. +.Lcrc_3lanes_4x_loop: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + crc32q 8(bufp), crc_init_q + crc32q 8(bufp,chunk_bytes_q), crc1 + crc32q 8(bufp,chunk_bytes_q,2), crc2 + crc32q 16(bufp), crc_init_q + crc32q 16(bufp,chunk_bytes_q), crc1 + crc32q 16(bufp,chunk_bytes_q,2), crc2 + crc32q 24(bufp), crc_init_q + crc32q 24(bufp,chunk_bytes_q), crc1 + crc32q 24(bufp,chunk_bytes_q,2), crc2 + add $32, bufp + sub $4, %eax + jge .Lcrc_3lanes_4x_loop + +.Lcrc_3lanes_4x_done: + add $4, %eax + jz .Lcrc_3lanes_last_qword + +.Lcrc_3lanes_1x_loop: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + add $8, bufp + dec %eax + jnz .Lcrc_3lanes_1x_loop - mov block_2, block_0 +.Lcrc_3lanes_last_qword: + crc32q (bufp), crc_init_q + crc32q (bufp,chunk_bytes_q), crc1 +# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet ################################################################ ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), %bufp # first entry is for idx 1 - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - sub %eax, len # len -= rax*24 + lea (K_table-8)(%rip), %rax # first entry is for idx 1 + pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 + lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 + sub %eax, len # len -= chunk_bytes * 3 movq crc_init_q, %xmm1 # CRC for block 1 pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 @@ -206,20 +175,19 @@ LABEL crc_ %i pxor %xmm2,%xmm1 movq %xmm1, %rax - xor -i*8(block_2), %rax + xor (bufp,chunk_bytes_q,2), %rax mov crc2, crc_init_q crc32 %rax, crc_init_q + lea 8(bufp,chunk_bytes_q,2), bufp ################################################################ - ## 5) Check for end: + ## 5) If more blocks remain, goto (2): ################################################################ -LABEL crc_ 0 - ENDBR cmp $128*24, len - jae .Lfull_block + jae .Lfull_block cmp $SMALL_SIZE, len - jae .Lcontinue_block + jae .Lpartial_block ####################################################################### ## 6) Process any remainder without interleaving: @@ -231,47 +199,30 @@ LABEL crc_ 0 shr $3, %eax jz .Ldo_dword .Ldo_qwords: - crc32q (bufptmp), crc_init_q - add $8, bufptmp + crc32q (bufp), crc_init_q + add $8, bufp dec %eax jnz .Ldo_qwords .Ldo_dword: test $4, len jz .Ldo_word - crc32l (bufptmp), crc_init - add $4, bufptmp + crc32l (bufp), crc_init + add $4, bufp .Ldo_word: test $2, len jz .Ldo_byte - crc32w (bufptmp), crc_init - add $2, bufptmp + crc32w (bufp), crc_init + add $2, bufp .Ldo_byte: test $1, len jz .Ldone - crc32b (bufptmp), crc_init + crc32b (bufp), crc_init .Ldone: mov crc_init, %eax - popq %rsi - popq %rdi - popq %rbx RET SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits - ################################################################ - ## jump table Table is 129 entries x 2 bytes each - ################################################################ -.align 4 -jump_table: - i=0 -.rept 129 -.altmacro -JMPTBL_ENTRY %i -.noaltmacro - i=i+1 -.endr - - ################################################################ ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each |