summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-11-27 20:42:55 +0300
committerWill Deacon <will.deacon@arm.com>2018-11-30 16:58:04 +0300
commitefdb25efc7645b326cd5eb82be5feeabe167c24e (patch)
treefca2b4c8aa400a78212468e40211689723bd6957
parent7dc48bf96aa0fc8aa5b38cc3e5c36ac03171e680 (diff)
downloadlinux-efdb25efc7645b326cd5eb82be5feeabe167c24e.tar.xz
arm64/lib: improve CRC32 performance for deep pipelines
Improve the performance of the crc32() asm routines by getting rid of most of the branches and small sized loads on the common path. Instead, use a branchless code path involving overlapping 16 byte loads to process the first (length % 32) bytes, and process the remainder using a loop that processes 32 bytes at a time. Tested using the following test program: #include <stdlib.h> extern void crc32_le(unsigned short, char const*, int); int main(void) { static const char buf[4096]; srand(20181126); for (int i = 0; i < 100 * 1000 * 1000; i++) crc32_le(0, buf, rand() % 1024); return 0; } On Cortex-A53 and Cortex-A57, the performance regresses but only very slightly. On Cortex-A72 however, the performance improves from $ time ./crc32 real 0m10.149s user 0m10.149s sys 0m0.000s to $ time ./crc32 real 0m7.915s user 0m7.915s sys 0m0.000s Cc: Rui Sun <sunrui26@huawei.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Will Deacon <will.deacon@arm.com>
-rw-r--r--arch/arm64/lib/crc32.S54
1 files changed, 49 insertions, 5 deletions
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 5bc1e85b4e1c..f132f2a7522e 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -15,15 +15,59 @@
.cpu generic+crc
.macro __crc32, c
-0: subs x2, x2, #16
- b.mi 8f
- ldp x3, x4, [x1], #16
+ cmp x2, #16
+ b.lt 8f // less than 16 bytes
+
+ and x7, x2, #0x1f
+ and x2, x2, #~0x1f
+ cbz x7, 32f // multiple of 32 bytes
+
+ and x8, x7, #0xf
+ ldp x3, x4, [x1]
+ add x8, x8, x1
+ add x1, x1, x7
+ ldp x5, x6, [x8]
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
+CPU_BE( rev x5, x5 )
+CPU_BE( rev x6, x6 )
+
+ tst x7, #8
+ crc32\c\()x w8, w0, x3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #4
+ lsr x4, x3, #32
+ crc32\c\()w w8, w0, w3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #2
+ lsr w4, w3, #16
+ crc32\c\()h w8, w0, w3
+ csel w3, w3, w4, eq
+ csel w0, w0, w8, eq
+ tst x7, #1
+ crc32\c\()b w8, w0, w3
+ csel w0, w0, w8, eq
+ tst x7, #16
+ crc32\c\()x w8, w0, x5
+ crc32\c\()x w8, w8, x6
+ csel w0, w0, w8, eq
+ cbz x2, 0f
+
+32: ldp x3, x4, [x1], #32
+ sub x2, x2, #32
+ ldp x5, x6, [x1, #-16]
+CPU_BE( rev x3, x3 )
+CPU_BE( rev x4, x4 )
+CPU_BE( rev x5, x5 )
+CPU_BE( rev x6, x6 )
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
- b.ne 0b
- ret
+ crc32\c\()x w0, w0, x5
+ crc32\c\()x w0, w0, x6
+ cbnz x2, 32b
+0: ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8