summaryrefslogtreecommitdiff
path: root/arch/loongarch/lib/memcpy.S
diff options
context:
space:
mode:
authorWANG Rui <wangrui@loongson.cn>2023-05-01 12:19:43 +0300
committerHuacai Chen <chenhuacai@loongson.cn>2023-05-01 12:19:43 +0300
commit8941e93ca5906fcca87dc74255b58054ec4d7868 (patch)
tree0bf2893d167c140db544ac0980342f4ec829106e /arch/loongarch/lib/memcpy.S
parent2b3bd32ea3a22ea2d5e591da4ac2c2b1fb17c0e0 (diff)
downloadlinux-8941e93ca5906fcca87dc74255b58054ec4d7868.tar.xz
LoongArch: Optimize memory ops (memset/memcpy/memmove)
To optimize memset()/memcpy()/memmove() and so on, we use a jump table to dispatch cases for short data lengths; and for long data lengths, we split the destination into head part (first 8 bytes), tail part (last 8 bytes) and middle part. The head part and tail part may be at unaligned addresses, while the middle part is always aligned (the middle part is allowed to overlap the head/tail part). In this way, the first and last 8 bytes may be unaligned accesses, but we can make sure the data in the middle is processed at an aligned destination address. We have tested micro-bench[1] on a Loongson-3C5000 16-core machine (2.2GHz): 1. memset | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 696.191 | 1518.785 | 118.16% | | 8 | 0 | 1 | 696.325 | 1518.937 | 118.14% | | 50 | 0 | 0 | 969.976 | 8053.902 | 730.32% | | 50 | 0 | 1 | 970.034 | 8058.475 | 730.74% | | 300 | 0 | 0 | 5876.612 | 16544.703 | 181.53% | | 300 | 0 | 1 | 5030.849 | 16549.011 | 228.95% | | 1200 | 0 | 0 | 11797.077 | 16752.137 | 42.00% | | 1200 | 0 | 1 | 5687.141 | 16645.233 | 192.68% | | 4000 | 0 | 0 | 15723.27 | 16761.557 | 6.60% | | 4000 | 0 | 1 | 5906.114 | 16732.316 | 183.30% | | 8000 | 0 | 0 | 16751.403 | 16770.002 | 0.11% | | 8000 | 0 | 1 | 5995.449 | 16754.07 | 179.45% | 2. memcpy | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 696.2 | 1670.605 | 139.96% | | 8 | 0 | 1 | 696.325 | 1671.138 | 139.99% | | 50 | 0 | 0 | 969.974 | 8724.999 | 799.51% | | 50 | 0 | 1 | 970.032 | 8730.138 | 799.98% | | 300 | 0 | 0 | 5564.662 | 16272.652 | 192.43% | | 300 | 0 | 1 | 4670.436 | 14972.842 | 220.59% | | 1200 | 0 | 0 | 10740.23 | 16751.728 | 55.97% | | 1200 | 0 | 1 | 5027.741 | 14874.564 | 195.85% | | 4000 | 0 | 0 | 15122.367 | 16737.642 | 10.68% | | 4000 | 0 | 1 | 5536.918 | 14890.397 | 168.93% | | 8000 | 0 | 0 | 16505.453 | 16553.543 | 0.29% | | 8000 | 0 | 1 | 5821.619 | 14841.804 | 154.94% | 3. memmove | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 982.693 | 1670.568 | 70.00% | | 8 | 0 | 1 | 983.023 | 1671.174 | 70.00% | | 50 | 0 | 0 | 1230.87 | 8727.625 | 609.06% | | 50 | 0 | 1 | 1232.515 | 8730.138 | 608.32% | | 300 | 0 | 0 | 6490.375 | 16296.993 | 151.09% | | 300 | 0 | 1 | 4282.687 | 14972.842 | 249.61% | | 1200 | 0 | 0 | 11742.755 | 16752.546 | 42.66% | | 1200 | 0 | 1 | 5039.338 | 14872.951 | 195.14% | | 4000 | 0 | 0 | 15467.786 | 16737.09 | 8.21% | | 4000 | 0 | 1 | 5009.905 | 14890.542 | 197.22% | | 8000 | 0 | 0 | 16489.664 | 16553.273 | 0.39% | | 8000 | 0 | 1 | 5823.786 | 14858.646 | 155.14% | * speed: MB/s * length: byte [1] https://github.com/heiher/mem-bench Signed-off-by: WANG Rui <wangrui@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Diffstat (limited to 'arch/loongarch/lib/memcpy.S')
-rw-r--r--arch/loongarch/lib/memcpy.S147
1 files changed, 121 insertions, 26 deletions
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
index 3b7e1dec7109..39ce6621c704 100644
--- a/arch/loongarch/lib/memcpy.S
+++ b/arch/loongarch/lib/memcpy.S
@@ -44,6 +44,66 @@ SYM_FUNC_START(__memcpy_generic)
SYM_FUNC_END(__memcpy_generic)
_ASM_NOKPROBE(__memcpy_generic)
+ .align 5
+SYM_FUNC_START_NOALIGN(__memcpy_small)
+ pcaddi t0, 8
+ slli.d a2, a2, 5
+ add.d t0, t0, a2
+ jr t0
+
+ .align 5
+0: jr ra
+
+ .align 5
+1: ld.b t0, a1, 0
+ st.b t0, a0, 0
+ jr ra
+
+ .align 5
+2: ld.h t0, a1, 0
+ st.h t0, a0, 0
+ jr ra
+
+ .align 5
+3: ld.h t0, a1, 0
+ ld.b t1, a1, 2
+ st.h t0, a0, 0
+ st.b t1, a0, 2
+ jr ra
+
+ .align 5
+4: ld.w t0, a1, 0
+ st.w t0, a0, 0
+ jr ra
+
+ .align 5
+5: ld.w t0, a1, 0
+ ld.b t1, a1, 4
+ st.w t0, a0, 0
+ st.b t1, a0, 4
+ jr ra
+
+ .align 5
+6: ld.w t0, a1, 0
+ ld.h t1, a1, 4
+ st.w t0, a0, 0
+ st.h t1, a0, 4
+ jr ra
+
+ .align 5
+7: ld.w t0, a1, 0
+ ld.w t1, a1, 3
+ st.w t0, a0, 0
+ st.w t1, a0, 3
+ jr ra
+
+ .align 5
+8: ld.d t0, a1, 0
+ st.d t0, a0, 0
+ jr ra
+SYM_FUNC_END(__memcpy_small)
+_ASM_NOKPROBE(__memcpy_small)
+
/*
* void *__memcpy_fast(void *dst, const void *src, size_t n)
*
@@ -52,14 +112,27 @@ _ASM_NOKPROBE(__memcpy_generic)
* a2: n
*/
SYM_FUNC_START(__memcpy_fast)
- move a3, a0
- beqz a2, 3f
+ sltui t0, a2, 9
+ bnez t0, __memcpy_small
+
+ add.d a3, a1, a2
+ add.d a2, a0, a2
+ ld.d a6, a1, 0
+ ld.d a7, a3, -8
+
+ /* align up destination address */
+ andi t1, a0, 7
+ sub.d t0, zero, t1
+ addi.d t0, t0, 8
+ add.d a1, a1, t0
+ add.d a5, a0, t0
- ori a4, zero, 64
- blt a2, a4, 2f
+ addi.d a4, a3, -64
+ bgeu a1, a4, .Llt64
/* copy 64 bytes at a time */
-1: ld.d t0, a1, 0
+.Lloop64:
+ ld.d t0, a1, 0
ld.d t1, a1, 8
ld.d t2, a1, 16
ld.d t3, a1, 24
@@ -67,32 +140,54 @@ SYM_FUNC_START(__memcpy_fast)
ld.d t5, a1, 40
ld.d t6, a1, 48
ld.d t7, a1, 56
- st.d t0, a0, 0
- st.d t1, a0, 8
- st.d t2, a0, 16
- st.d t3, a0, 24
- st.d t4, a0, 32
- st.d t5, a0, 40
- st.d t6, a0, 48
- st.d t7, a0, 56
-
- addi.d a0, a0, 64
addi.d a1, a1, 64
- addi.d a2, a2, -64
- bge a2, a4, 1b
-
- beqz a2, 3f
+ st.d t0, a5, 0
+ st.d t1, a5, 8
+ st.d t2, a5, 16
+ st.d t3, a5, 24
+ st.d t4, a5, 32
+ st.d t5, a5, 40
+ st.d t6, a5, 48
+ st.d t7, a5, 56
+ addi.d a5, a5, 64
+ bltu a1, a4, .Lloop64
/* copy the remaining bytes */
-2: ld.b t0, a1, 0
- st.b t0, a0, 0
- addi.d a0, a0, 1
- addi.d a1, a1, 1
- addi.d a2, a2, -1
- bgt a2, zero, 2b
+.Llt64:
+ addi.d a4, a3, -32
+ bgeu a1, a4, .Llt32
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+ addi.d a1, a1, 32
+ st.d t0, a5, 0
+ st.d t1, a5, 8
+ st.d t2, a5, 16
+ st.d t3, a5, 24
+ addi.d a5, a5, 32
+
+.Llt32:
+ addi.d a4, a3, -16
+ bgeu a1, a4, .Llt16
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ addi.d a1, a1, 16
+ st.d t0, a5, 0
+ st.d t1, a5, 8
+ addi.d a5, a5, 16
+
+.Llt16:
+ addi.d a4, a3, -8
+ bgeu a1, a4, .Llt8
+ ld.d t0, a1, 0
+ st.d t0, a5, 0
+
+.Llt8:
+ st.d a6, a0, 0
+ st.d a7, a2, -8
/* return */
-3: move a0, a3
jr ra
SYM_FUNC_END(__memcpy_fast)
_ASM_NOKPROBE(__memcpy_fast)