summaryrefslogtreecommitdiff
path: root/arch/powerpc/lib/string_64.S
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2012-05-27 23:54:03 +0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2012-07-03 08:14:41 +0400
commit17968fbbd19f1bb281ee4eb2548764ac5664c4ec (patch)
treec6b7a68ea7897e6bf213bffa4795fe209609cba9 /arch/powerpc/lib/string_64.S
parentd136e27326a3bd50d7929a43c018abf13e426b7e (diff)
downloadlinux-17968fbbd19f1bb281ee4eb2548764ac5664c4ec.tar.xz
powerpc: 64bit optimised __clear_user
I noticed __clear_user high up in a profile of one of my RAID stress tests. The testcase was doing a dd from /dev/zero which ends up calling __clear_user. __clear_user is basically a loop with a single 4 byte store which is horribly slow. We can do much better by aligning the desination and doing 32 bytes of 8 byte stores in a loop. The following testcase was used to verify the patch: http://ozlabs.org/~anton/junkcode/stress_clear_user.c To show the improvement in performance I ran a dd from /dev/zero to /dev/null on a POWER7 box: Before: # dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 3.72379 s, 2.8 GB/s After: # time dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 0.728318 s, 14.4 GB/s Over 5x faster. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib/string_64.S')
-rw-r--r--arch/powerpc/lib/string_64.S141
1 files changed, 141 insertions, 0 deletions
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 000000000000..6613b9047005
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,141 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to: Destination address, in user space.
+ * @n: Number of bytes to zero.
+ *
+ * Zero a block of memory in user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+
+ .macro err1
+100:
+ .section __ex_table,"a"
+ .align 3
+ .llong 100b,.Ldo_err1
+ .previous
+ .endm
+
+ .macro err2
+200:
+ .section __ex_table,"a"
+ .align 3
+ .llong 200b,.Ldo_err2
+ .previous
+ .endm
+
+ .macro err3
+300:
+ .section __ex_table,"a"
+ .align 3
+ .llong 300b,.Ldo_err3
+ .previous
+ .endm
+
+.Ldo_err1:
+ mr r3,r8
+
+.Ldo_err2:
+ mtctr r4
+1:
+err3; stb r0,0(r3)
+ addi r3,r3,1
+ addi r4,r4,-1
+ bdnz 1b
+
+.Ldo_err3:
+ mr r3,r4
+ blr
+
+_GLOBAL(__clear_user)
+ cmpdi r4,32
+ neg r6,r3
+ li r0,0
+ blt .Lshort_clear
+ mr r8,r3
+ mtocrf 0x01,r6
+ clrldi r6,r6,(64-3)
+
+ /* Get the destination 8 byte aligned */
+ bf cr7*4+3,1f
+err1; stb r0,0(r3)
+ addi r3,r3,1
+
+1: bf cr7*4+2,2f
+err1; sth r0,0(r3)
+ addi r3,r3,2
+
+2: bf cr7*4+1,3f
+err1; stw r0,0(r3)
+ addi r3,r3,4
+
+3: sub r4,r4,r6
+ srdi r6,r4,5
+ cmpdi r4,32
+ blt .Lshort_clear
+ mtctr r6
+
+ /* Do 32 byte chunks */
+4:
+err2; std r0,0(r3)
+err2; std r0,8(r3)
+err2; std r0,16(r3)
+err2; std r0,24(r3)
+ addi r3,r3,32
+ addi r4,r4,-32
+ bdnz 4b
+
+.Lshort_clear:
+ /* up to 31 bytes to go */
+ cmpdi r4,16
+ blt 6f
+err2; std r0,0(r3)
+err2; std r0,8(r3)
+ addi r3,r3,16
+ addi r4,r4,-16
+
+ /* Up to 15 bytes to go */
+6: mr r8,r3
+ clrldi r4,r4,(64-4)
+ mtocrf 0x01,r4
+ bf cr7*4+0,7f
+err1; std r0,0(r3)
+ addi r3,r3,8
+
+7: bf cr7*4+1,8f
+err1; stw r0,0(r3)
+ addi r3,r3,4
+
+8: bf cr7*4+2,9f
+err1; sth r0,0(r3)
+ addi r3,r3,2
+
+9: bf cr7*4+3,10f
+err1; stb r0,0(r3)
+
+10: li r3,0
+ blr