Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--  arch/powerpc/lib/Makefile               |   2
-rw-r--r--  arch/powerpc/lib/code-patching.c        |  16
-rw-r--r--  arch/powerpc/lib/copy_32.S              |   9
-rw-r--r--  arch/powerpc/lib/copypage_64.S          |   1
-rw-r--r--  arch/powerpc/lib/copypage_power7.S      |   4
-rw-r--r--  arch/powerpc/lib/copyuser_64.S          | 587
-rw-r--r--  arch/powerpc/lib/copyuser_power7.S      |  21
-rw-r--r--  arch/powerpc/lib/feature-fixups-test.S  |   1
-rw-r--r--  arch/powerpc/lib/feature-fixups.c       |  35
-rw-r--r--  arch/powerpc/lib/hweight_64.S           |   1
-rw-r--r--  arch/powerpc/lib/ldstfp.S               |   1
-rw-r--r--  arch/powerpc/lib/locks.c                |   1
-rw-r--r--  arch/powerpc/lib/memcmp_64.S            | 414
-rw-r--r--  arch/powerpc/lib/memcpy_64.S            |  11
-rw-r--r--  arch/powerpc/lib/memcpy_power7.S        |  28
-rw-r--r--  arch/powerpc/lib/strlen_32.S            |  78
-rw-r--r--  arch/powerpc/lib/vmx-helper.c           |   4
17 files changed, 839 insertions, 375 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
obj-y += string.o alloc.o code-patching.o feature-fixups.o
-obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o
# See corresponding test in arch/powerpc/Makefile
# 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index e0d881ab304e..850f3b8f4da5 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -195,6 +195,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
return patch_instruction(addr, create_branch(addr, target, flags));
}
+int patch_branch_site(s32 *site, unsigned long target, int flags)
+{
+ unsigned int *addr;
+
+ addr = (unsigned int *)((unsigned long)site + *site);
+ return patch_instruction(addr, create_branch(addr, target, flags));
+}
+
+int patch_instruction_site(s32 *site, unsigned int instr)
+{
+ unsigned int *addr;
+
+ addr = (unsigned int *)((unsigned long)site + *site);
+ return patch_instruction(addr, instr);
+}
+
bool is_offset_in_branch_range(long offset)
{
/*
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index da425bb6b369..ba66846fe973 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -13,6 +13,7 @@
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/code-patching-asm.h>
#define COPY_16_BYTES \
lwz r7,4(r4); \
@@ -107,8 +108,8 @@ _GLOBAL(memset)
* Skip optimised block until cache is enabled. Will be replaced
* by a 'bne' during boot to use the normal procedure if r4 is not zero
*/
-_GLOBAL(memset_nocache_branch)
- b 2f
+5: b 2f
+ patch_site 5b, patch__memset_nocache
clrlwi r7,r6,32-LG_CACHELINE_BYTES
add r8,r7,r5
@@ -168,7 +169,9 @@ _GLOBAL(memmove)
/* fall through */
_GLOBAL(memcpy)
- b generic_memcpy
+1: b generic_memcpy
+ patch_site 1b, patch__memcpy_nocache
+
add r7,r3,r5 /* test if the src & dst overlap */
add r8,r4,r5
cmplw 0,r4,r7
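
The patch_site markers added to copy_32.S above, together with the new patch_branch_site()/patch_instruction_site() helpers in code-patching.c, let boot code locate a branch instruction through a position-independent signed 32-bit offset and rewrite it in place. A minimal C sketch of the idea follows; PPC_INST_NOP is the standard powerpc nop encoding, while the helper names themselves are illustrative assumptions rather than kernel identifiers.

#include <stdint.h>

#define PPC_INST_NOP 0x60000000u	/* ori r0,r0,0 */

/* A patch site variable holds a signed 32-bit offset from its own
 * address to the instruction that should be patched. */
static uint32_t *site_to_addr(int32_t *site)
{
	/* same arithmetic as patch_instruction_site() in the diff above */
	return (uint32_t *)((uintptr_t)site + *site);
}

/* Replacing the recorded branch with a nop enables the optimised path.
 * The real kernel goes through patch_instruction(), which also handles
 * write protection and icache flushing; this only shows the idea. */
static void nop_out_site(int32_t *site)
{
	*site_to_addr(site) = PPC_INST_NOP;
}
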
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 8d5034f645f3..694390357667 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -11,6 +11,7 @@
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
+#include <asm/feature-fixups.h>
.section ".toc","aw"
PPC64_CACHES:
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addi r3,r3,128
bdnz 1b
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#else
li r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 506677395681..96c514bee66b 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -9,6 +9,13 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE 0
+#endif
#ifdef __BIG_ENDIAN__
#define sLd sld /* Shift towards low-numbered address. */
@@ -18,6 +25,28 @@
#define sHd sld /* Shift towards high-numbered address. */
#endif
+/*
+ * These macros are used to generate exception table entries.
+ * The exception handlers below use the original arguments
+ * (stored on the stack) and the point where we're up to in
+ * the destination buffer, i.e. the address of the first
+ * unmodified byte. Generally r3 points into the destination
+ * buffer, but the first unmodified byte is at a variable
+ * offset from r3. In the code below, the symbol r3_offset
+ * is set to indicate the current offset at each point in
+ * the code. This offset is then used as a negative offset
+ * from the exception handler code, and those instructions
+ * before the exception handlers are addi instructions that
+ * adjust r3 to point to the correct place.
+ */
+ .macro lex /* exception handler for load */
+100: EX_TABLE(100b, .Lld_exc - r3_offset)
+ .endm
+
+ .macro stex /* exception handler for store */
+100: EX_TABLE(100b, .Lst_exc - r3_offset)
+ .endm
+
.align 7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
@@ -28,7 +57,7 @@ FTR_SECTION_ELSE
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
- /* first check for a whole page copy on a page boundary */
+ /* first check for a 4kB copy on a 4kB boundary */
cmpldi cr1,r5,16
cmpdi cr6,r5,4096
or r0,r3,r4
@@ -49,6 +78,7 @@ _GLOBAL(__copy_tofrom_user_base)
* At the time of writing the only CPU that has this combination of bits
* set is Power6.
*/
+test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
nop
FTR_SECTION_ELSE
@@ -57,6 +87,8 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
addi r3,r3,-16
+r3_offset = 16
+test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
andi. r0,r4,7
bne .Lsrc_unaligned
@@ -64,57 +96,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
blt cr1,.Ldo_tail /* if < 16 bytes to copy */
srdi r0,r5,5
cmpdi cr1,r0,0
-20: ld r7,0(r4)
-220: ld r6,8(r4)
+lex; ld r7,0(r4)
+lex; ld r6,8(r4)
addi r4,r4,16
mtctr r0
andi. r0,r5,0x10
beq 22f
addi r3,r3,16
+r3_offset = 0
addi r4,r4,-16
mr r9,r7
mr r8,r6
beq cr1,72f
-21: ld r7,16(r4)
-221: ld r6,24(r4)
+21:
+lex; ld r7,16(r4)
+lex; ld r6,24(r4)
addi r4,r4,32
-70: std r9,0(r3)
-270: std r8,8(r3)
-22: ld r9,0(r4)
-222: ld r8,8(r4)
-71: std r7,16(r3)
-271: std r6,24(r3)
+stex; std r9,0(r3)
+r3_offset = 8
+stex; std r8,8(r3)
+r3_offset = 16
+22:
+lex; ld r9,0(r4)
+lex; ld r8,8(r4)
+stex; std r7,16(r3)
+r3_offset = 24
+stex; std r6,24(r3)
addi r3,r3,32
+r3_offset = 0
bdnz 21b
-72: std r9,0(r3)
-272: std r8,8(r3)
+72:
+stex; std r9,0(r3)
+r3_offset = 8
+stex; std r8,8(r3)
+r3_offset = 16
andi. r5,r5,0xf
beq+ 3f
addi r4,r4,16
.Ldo_tail:
addi r3,r3,16
+r3_offset = 0
bf cr7*4+0,246f
-244: ld r9,0(r4)
+lex; ld r9,0(r4)
addi r4,r4,8
-245: std r9,0(r3)
+stex; std r9,0(r3)
addi r3,r3,8
246: bf cr7*4+1,1f
-23: lwz r9,0(r4)
+lex; lwz r9,0(r4)
addi r4,r4,4
-73: stw r9,0(r3)
+stex; stw r9,0(r3)
addi r3,r3,4
1: bf cr7*4+2,2f
-44: lhz r9,0(r4)
+lex; lhz r9,0(r4)
addi r4,r4,2
-74: sth r9,0(r3)
+stex; sth r9,0(r3)
addi r3,r3,2
2: bf cr7*4+3,3f
-45: lbz r9,0(r4)
-75: stb r9,0(r3)
+lex; lbz r9,0(r4)
+stex; stb r9,0(r3)
3: li r3,0
blr
.Lsrc_unaligned:
+r3_offset = 16
srdi r6,r5,3
addi r5,r5,-16
subf r4,r0,r4
@@ -127,58 +171,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
add r5,r5,r0
bt cr7*4+0,28f
-24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */
-25: ld r0,8(r4)
+lex; ld r9,0(r4) /* 3+2n loads, 2+2n stores */
+lex; ld r0,8(r4)
sLd r6,r9,r10
-26: ldu r9,16(r4)
+lex; ldu r9,16(r4)
sHd r7,r0,r11
sLd r8,r0,r10
or r7,r7,r6
blt cr6,79f
-27: ld r0,8(r4)
+lex; ld r0,8(r4)
b 2f
-28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */
-29: ldu r9,8(r4)
+28:
+lex; ld r0,0(r4) /* 4+2n loads, 3+2n stores */
+lex; ldu r9,8(r4)
sLd r8,r0,r10
addi r3,r3,-8
+r3_offset = 24
blt cr6,5f
-30: ld r0,8(r4)
+lex; ld r0,8(r4)
sHd r12,r9,r11
sLd r6,r9,r10
-31: ldu r9,16(r4)
+lex; ldu r9,16(r4)
or r12,r8,r12
sHd r7,r0,r11
sLd r8,r0,r10
addi r3,r3,16
+r3_offset = 8
beq cr6,78f
1: or r7,r7,r6
-32: ld r0,8(r4)
-76: std r12,8(r3)
+lex; ld r0,8(r4)
+stex; std r12,8(r3)
+r3_offset = 16
2: sHd r12,r9,r11
sLd r6,r9,r10
-33: ldu r9,16(r4)
+lex; ldu r9,16(r4)
or r12,r8,r12
-77: stdu r7,16(r3)
+stex; stdu r7,16(r3)
+r3_offset = 8
sHd r7,r0,r11
sLd r8,r0,r10
bdnz 1b
-78: std r12,8(r3)
+78:
+stex; std r12,8(r3)
+r3_offset = 16
or r7,r7,r6
-79: std r7,16(r3)
+79:
+stex; std r7,16(r3)
+r3_offset = 24
5: sHd r12,r9,r11
or r12,r8,r12
-80: std r12,24(r3)
+stex; std r12,24(r3)
+r3_offset = 32
bne 6f
li r3,0
blr
6: cmpwi cr1,r5,8
addi r3,r3,32
+r3_offset = 0
sLd r9,r9,r10
ble cr1,7f
-34: ld r0,8(r4)
+lex; ld r0,8(r4)
sHd r7,r0,r11
or r9,r7,r9
7:
@@ -186,7 +241,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,32
#endif
-94: stw r9,0(r3)
+stex; stw r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,32
#endif
@@ -195,7 +250,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,16
#endif
-95: sth r9,0(r3)
+stex; sth r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,16
#endif
@@ -204,7 +259,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
#ifdef __BIG_ENDIAN__
rotldi r9,r9,8
#endif
-96: stb r9,0(r3)
+stex; stb r9,0(r3)
#ifdef __LITTLE_ENDIAN__
rotrdi r9,r9,8
#endif
@@ -212,47 +267,55 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
blr
.Ldst_unaligned:
+r3_offset = 0
PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */
subf r5,r6,r5
li r7,0
cmpldi cr1,r5,16
bf cr7*4+3,1f
-35: lbz r0,0(r4)
-81: stb r0,0(r3)
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lbz r0,0(r4)
+100: EX_TABLE(100b, .Lst_exc_r7)
+ stb r0,0(r3)
addi r7,r7,1
1: bf cr7*4+2,2f
-36: lhzx r0,r7,r4
-82: sthx r0,r7,r3
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lhzx r0,r7,r4
+100: EX_TABLE(100b, .Lst_exc_r7)
+ sthx r0,r7,r3
addi r7,r7,2
2: bf cr7*4+1,3f
-37: lwzx r0,r7,r4
-83: stwx r0,r7,r3
+100: EX_TABLE(100b, .Lld_exc_r7)
+ lwzx r0,r7,r4
+100: EX_TABLE(100b, .Lst_exc_r7)
+ stwx r0,r7,r3
3: PPC_MTOCRF(0x01,r5)
add r4,r6,r4
add r3,r6,r3
b .Ldst_aligned
.Lshort_copy:
+r3_offset = 0
bf cr7*4+0,1f
-38: lwz r0,0(r4)
-39: lwz r9,4(r4)
+lex; lwz r0,0(r4)
+lex; lwz r9,4(r4)
addi r4,r4,8
-84: stw r0,0(r3)
-85: stw r9,4(r3)
+stex; stw r0,0(r3)
+stex; stw r9,4(r3)
addi r3,r3,8
1: bf cr7*4+1,2f
-40: lwz r0,0(r4)
+lex; lwz r0,0(r4)
addi r4,r4,4
-86: stw r0,0(r3)
+stex; stw r0,0(r3)
addi r3,r3,4
2: bf cr7*4+2,3f
-41: lhz r0,0(r4)
+lex; lhz r0,0(r4)
addi r4,r4,2
-87: sth r0,0(r3)
+stex; sth r0,0(r3)
addi r3,r3,2
3: bf cr7*4+3,4f
-42: lbz r0,0(r4)
-88: stb r0,0(r3)
+lex; lbz r0,0(r4)
+stex; stb r0,0(r3)
4: li r3,0
blr
@@ -260,48 +323,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* exception handlers follow
* we have to return the number of bytes not copied
* for an exception on a load, we set the rest of the destination to 0
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lld_exc - r3_offset as the handler address.
*/
-136:
-137:
+.Lld_exc_r7:
add r3,r3,r7
- b 1f
-130:
-131:
+ b .Lld_exc
+
+ /* adjust by 24 */
addi r3,r3,8
-120:
-320:
-122:
-322:
-124:
-125:
-126:
-127:
-128:
-129:
-133:
+ nop
+ /* adjust by 16 */
addi r3,r3,8
-132:
+ nop
+ /* adjust by 8 */
addi r3,r3,8
-121:
-321:
-344:
-134:
-135:
-138:
-139:
-140:
-141:
-142:
-123:
-144:
-145:
+ nop
/*
- * here we have had a fault on a load and r3 points to the first
- * unmodified byte of the destination
+ * Here we have had a fault on a load and r3 points to the first
+ * unmodified byte of the destination. We use the original arguments
+ * and r3 to work out how much wasn't copied. Since we load some
+ * distance ahead of the stores, we continue copying byte-by-byte until
+ * we hit the load fault again in order to copy as much as possible.
*/
-1: ld r6,-24(r1)
+.Lld_exc:
+ ld r6,-24(r1)
ld r4,-16(r1)
ld r5,-8(r1)
subf r6,r6,r3
@@ -312,9 +361,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* first see if we can copy any more bytes before hitting another exception
*/
mtctr r5
+r3_offset = 0
+100: EX_TABLE(100b, .Ldone)
43: lbz r0,0(r4)
addi r4,r4,1
-89: stb r0,0(r3)
+stex; stb r0,0(r3)
addi r3,r3,1
bdnz 43b
li r3,0 /* huh? all copied successfully this time? */
@@ -323,116 +374,63 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
/*
* here we have trapped again, amount remaining is in ctr.
*/
-143: mfctr r3
+.Ldone:
+ mfctr r3
blr
/*
- * exception handlers for stores: we just need to work
- * out how many bytes weren't copied
+ * exception handlers for stores: we need to work out how many bytes
+ * weren't copied, and we may need to copy some more.
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lst_exc - r3_offset as the handler address.
*/
-182:
-183:
+.Lst_exc_r7:
add r3,r3,r7
- b 1f
-371:
-180:
+ b .Lst_exc
+
+ /* adjust by 24 */
addi r3,r3,8
-171:
-177:
-179:
+ nop
+ /* adjust by 16 */
addi r3,r3,8
-370:
-372:
-176:
-178:
+ nop
+ /* adjust by 8 */
addi r3,r3,4
-185:
+ /* adjust by 4 */
addi r3,r3,4
-170:
-172:
-345:
-173:
-174:
-175:
-181:
-184:
-186:
-187:
-188:
-189:
-194:
-195:
-196:
-1:
- ld r6,-24(r1)
- ld r5,-8(r1)
- add r6,r6,r5
- subf r3,r3,r6 /* #bytes not copied */
+.Lst_exc:
+ ld r6,-24(r1) /* original destination pointer */
+ ld r4,-16(r1) /* original source pointer */
+ ld r5,-8(r1) /* original number of bytes */
+ add r7,r6,r5
+ /*
+ * If the destination pointer isn't 8-byte aligned,
+ * we may have got the exception as a result of a
+ * store that overlapped a page boundary, so we may be
+ * able to copy a few more bytes.
+ */
+17: andi. r0,r3,7
+ beq 19f
+ subf r8,r6,r3 /* #bytes copied */
+100: EX_TABLE(100b,19f)
+ lbzx r0,r8,r4
+100: EX_TABLE(100b,19f)
+ stb r0,0(r3)
+ addi r3,r3,1
+ cmpld r3,r7
+ blt 17b
+19: subf r3,r3,r7 /* #bytes not copied in r3 */
blr
- EX_TABLE(20b,120b)
- EX_TABLE(220b,320b)
- EX_TABLE(21b,121b)
- EX_TABLE(221b,321b)
- EX_TABLE(70b,170b)
- EX_TABLE(270b,370b)
- EX_TABLE(22b,122b)
- EX_TABLE(222b,322b)
- EX_TABLE(71b,171b)
- EX_TABLE(271b,371b)
- EX_TABLE(72b,172b)
- EX_TABLE(272b,372b)
- EX_TABLE(244b,344b)
- EX_TABLE(245b,345b)
- EX_TABLE(23b,123b)
- EX_TABLE(73b,173b)
- EX_TABLE(44b,144b)
- EX_TABLE(74b,174b)
- EX_TABLE(45b,145b)
- EX_TABLE(75b,175b)
- EX_TABLE(24b,124b)
- EX_TABLE(25b,125b)
- EX_TABLE(26b,126b)
- EX_TABLE(27b,127b)
- EX_TABLE(28b,128b)
- EX_TABLE(29b,129b)
- EX_TABLE(30b,130b)
- EX_TABLE(31b,131b)
- EX_TABLE(32b,132b)
- EX_TABLE(76b,176b)
- EX_TABLE(33b,133b)
- EX_TABLE(77b,177b)
- EX_TABLE(78b,178b)
- EX_TABLE(79b,179b)
- EX_TABLE(80b,180b)
- EX_TABLE(34b,134b)
- EX_TABLE(94b,194b)
- EX_TABLE(95b,195b)
- EX_TABLE(96b,196b)
- EX_TABLE(35b,135b)
- EX_TABLE(81b,181b)
- EX_TABLE(36b,136b)
- EX_TABLE(82b,182b)
- EX_TABLE(37b,137b)
- EX_TABLE(83b,183b)
- EX_TABLE(38b,138b)
- EX_TABLE(39b,139b)
- EX_TABLE(84b,184b)
- EX_TABLE(85b,185b)
- EX_TABLE(40b,140b)
- EX_TABLE(86b,186b)
- EX_TABLE(41b,141b)
- EX_TABLE(87b,187b)
- EX_TABLE(42b,142b)
- EX_TABLE(88b,188b)
- EX_TABLE(43b,143b)
- EX_TABLE(89b,189b)
-
/*
* Routine to copy a whole page of data, optimized for POWER4.
* On POWER4 it is more than 50% faster than the simple loop
* above (following the .Ldst_aligned label).
*/
+ .macro exc
+100: EX_TABLE(100b, .Labort)
+ .endm
.Lcopy_page_4K:
std r31,-32(1)
std r30,-40(1)
@@ -451,86 +449,86 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
li r0,5
0: addi r5,r5,-24
mtctr r0
-20: ld r22,640(4)
-21: ld r21,512(4)
-22: ld r20,384(4)
-23: ld r11,256(4)
-24: ld r9,128(4)
-25: ld r7,0(4)
-26: ld r25,648(4)
-27: ld r24,520(4)
-28: ld r23,392(4)
-29: ld r10,264(4)
-30: ld r8,136(4)
-31: ldu r6,8(4)
+exc; ld r22,640(4)
+exc; ld r21,512(4)
+exc; ld r20,384(4)
+exc; ld r11,256(4)
+exc; ld r9,128(4)
+exc; ld r7,0(4)
+exc; ld r25,648(4)
+exc; ld r24,520(4)
+exc; ld r23,392(4)
+exc; ld r10,264(4)
+exc; ld r8,136(4)
+exc; ldu r6,8(4)
cmpwi r5,24
1:
-32: std r22,648(3)
-33: std r21,520(3)
-34: std r20,392(3)
-35: std r11,264(3)
-36: std r9,136(3)
-37: std r7,8(3)
-38: ld r28,648(4)
-39: ld r27,520(4)
-40: ld r26,392(4)
-41: ld r31,264(4)
-42: ld r30,136(4)
-43: ld r29,8(4)
-44: std r25,656(3)
-45: std r24,528(3)
-46: std r23,400(3)
-47: std r10,272(3)
-48: std r8,144(3)
-49: std r6,16(3)
-50: ld r22,656(4)
-51: ld r21,528(4)
-52: ld r20,400(4)
-53: ld r11,272(4)
-54: ld r9,144(4)
-55: ld r7,16(4)
-56: std r28,664(3)
-57: std r27,536(3)
-58: std r26,408(3)
-59: std r31,280(3)
-60: std r30,152(3)
-61: stdu r29,24(3)
-62: ld r25,664(4)
-63: ld r24,536(4)
-64: ld r23,408(4)
-65: ld r10,280(4)
-66: ld r8,152(4)
-67: ldu r6,24(4)
+exc; std r22,648(3)
+exc; std r21,520(3)
+exc; std r20,392(3)
+exc; std r11,264(3)
+exc; std r9,136(3)
+exc; std r7,8(3)
+exc; ld r28,648(4)
+exc; ld r27,520(4)
+exc; ld r26,392(4)
+exc; ld r31,264(4)
+exc; ld r30,136(4)
+exc; ld r29,8(4)
+exc; std r25,656(3)
+exc; std r24,528(3)
+exc; std r23,400(3)
+exc; std r10,272(3)
+exc; std r8,144(3)
+exc; std r6,16(3)
+exc; ld r22,656(4)
+exc; ld r21,528(4)
+exc; ld r20,400(4)
+exc; ld r11,272(4)
+exc; ld r9,144(4)
+exc; ld r7,16(4)
+exc; std r28,664(3)
+exc; std r27,536(3)
+exc; std r26,408(3)
+exc; std r31,280(3)
+exc; std r30,152(3)
+exc; stdu r29,24(3)
+exc; ld r25,664(4)
+exc; ld r24,536(4)
+exc; ld r23,408(4)
+exc; ld r10,280(4)
+exc; ld r8,152(4)
+exc; ldu r6,24(4)
bdnz 1b
-68: std r22,648(3)
-69: std r21,520(3)
-70: std r20,392(3)
-71: std r11,264(3)
-72: std r9,136(3)
-73: std r7,8(3)
-74: addi r4,r4,640
-75: addi r3,r3,648
+exc; std r22,648(3)
+exc; std r21,520(3)
+exc; std r20,392(3)
+exc; std r11,264(3)
+exc; std r9,136(3)
+exc; std r7,8(3)
+ addi r4,r4,640
+ addi r3,r3,648
bge 0b
mtctr r5
-76: ld r7,0(4)
-77: ld r8,8(4)
-78: ldu r9,16(4)
+exc; ld r7,0(4)
+exc; ld r8,8(4)
+exc; ldu r9,16(4)
3:
-79: ld r10,8(4)
-80: std r7,8(3)
-81: ld r7,16(4)
-82: std r8,16(3)
-83: ld r8,24(4)
-84: std r9,24(3)
-85: ldu r9,32(4)
-86: stdu r10,32(3)
+exc; ld r10,8(4)
+exc; std r7,8(3)
+exc; ld r7,16(4)
+exc; std r8,16(3)
+exc; ld r8,24(4)
+exc; std r9,24(3)
+exc; ldu r9,32(4)
+exc; stdu r10,32(3)
bdnz 3b
4:
-87: ld r10,8(4)
-88: std r7,8(3)
-89: std r8,16(3)
-90: std r9,24(3)
-91: std r10,32(3)
+exc; ld r10,8(4)
+exc; std r7,8(3)
+exc; std r8,16(3)
+exc; std r9,24(3)
+exc; std r10,32(3)
9: ld r20,-120(1)
ld r21,-112(1)
ld r22,-104(1)
@@ -550,7 +548,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
* on an exception, reset to the beginning and jump back into the
* standard __copy_tofrom_user
*/
-100: ld r20,-120(1)
+.Labort:
+ ld r20,-120(1)
ld r21,-112(1)
ld r22,-104(1)
ld r23,-96(1)
@@ -566,78 +565,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
ld r4,-16(r1)
li r5,4096
b .Ldst_aligned
-
- EX_TABLE(20b,100b)
- EX_TABLE(21b,100b)
- EX_TABLE(22b,100b)
- EX_TABLE(23b,100b)
- EX_TABLE(24b,100b)
- EX_TABLE(25b,100b)
- EX_TABLE(26b,100b)
- EX_TABLE(27b,100b)
- EX_TABLE(28b,100b)
- EX_TABLE(29b,100b)
- EX_TABLE(30b,100b)
- EX_TABLE(31b,100b)
- EX_TABLE(32b,100b)
- EX_TABLE(33b,100b)
- EX_TABLE(34b,100b)
- EX_TABLE(35b,100b)
- EX_TABLE(36b,100b)
- EX_TABLE(37b,100b)
- EX_TABLE(38b,100b)
- EX_TABLE(39b,100b)
- EX_TABLE(40b,100b)
- EX_TABLE(41b,100b)
- EX_TABLE(42b,100b)
- EX_TABLE(43b,100b)
- EX_TABLE(44b,100b)
- EX_TABLE(45b,100b)
- EX_TABLE(46b,100b)
- EX_TABLE(47b,100b)
- EX_TABLE(48b,100b)
- EX_TABLE(49b,100b)
- EX_TABLE(50b,100b)
- EX_TABLE(51b,100b)
- EX_TABLE(52b,100b)
- EX_TABLE(53b,100b)
- EX_TABLE(54b,100b)
- EX_TABLE(55b,100b)
- EX_TABLE(56b,100b)
- EX_TABLE(57b,100b)
- EX_TABLE(58b,100b)
- EX_TABLE(59b,100b)
- EX_TABLE(60b,100b)
- EX_TABLE(61b,100b)
- EX_TABLE(62b,100b)
- EX_TABLE(63b,100b)
- EX_TABLE(64b,100b)
- EX_TABLE(65b,100b)
- EX_TABLE(66b,100b)
- EX_TABLE(67b,100b)
- EX_TABLE(68b,100b)
- EX_TABLE(69b,100b)
- EX_TABLE(70b,100b)
- EX_TABLE(71b,100b)
- EX_TABLE(72b,100b)
- EX_TABLE(73b,100b)
- EX_TABLE(74b,100b)
- EX_TABLE(75b,100b)
- EX_TABLE(76b,100b)
- EX_TABLE(77b,100b)
- EX_TABLE(78b,100b)
- EX_TABLE(79b,100b)
- EX_TABLE(80b,100b)
- EX_TABLE(81b,100b)
- EX_TABLE(82b,100b)
- EX_TABLE(83b,100b)
- EX_TABLE(84b,100b)
- EX_TABLE(85b,100b)
- EX_TABLE(86b,100b)
- EX_TABLE(87b,100b)
- EX_TABLE(88b,100b)
- EX_TABLE(89b,100b)
- EX_TABLE(90b,100b)
- EX_TABLE(91b,100b)
-
EXPORT_SYMBOL(__copy_tofrom_user)
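
The exception-handler rework above rests on two ideas: a single .Lld_exc/.Lst_exc handler reached via the "label - r3_offset" trick, and a return value equal to the number of destination bytes that were never written. A rough C model of the store-fault calculation (.Lst_exc) is sketched below for illustration only; the names are not kernel identifiers.

#include <stddef.h>

/* r6/r4/r5 are reloaded from the stack as the original dst/src/len;
 * r3 points at the first unmodified destination byte when the fault hit. */
static size_t bytes_not_copied(const char *orig_dst, size_t orig_len,
			       const char *first_unmodified)
{
	const char *dst_end = orig_dst + orig_len;	/* add r7,r6,r5 */

	return dst_end - first_unmodified;		/* subf r3,r3,r7 */
}
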
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 215e4760c09f..1a1fe180af62 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,11 @@
*/
#include <asm/ppc_asm.h>
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE 0
+#endif
+
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
@@ -80,7 +85,6 @@
_GLOBAL(__copy_tofrom_user_power7)
-#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,3328
@@ -89,15 +93,12 @@ _GLOBAL(__copy_tofrom_user_power7)
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
blt .Lshort_copy
- bge cr1,.Lvmx_copy
-#else
- cmpldi r5,16
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
- std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
-
- blt .Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+ bgt cr1,.Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
.Lnonvmx_copy:
@@ -278,8 +279,8 @@ err1; stb r0,0(r3)
addi r1,r1,STACKFRAMESIZE
b .Lnonvmx_copy
-#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
mflr r0
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S
index f16cec989506..ee7c5fd5fc64 100644
--- a/arch/powerpc/lib/feature-fixups-test.S
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -11,6 +11,7 @@
#include <asm/feature-fixups.h>
#include <asm/ppc_asm.h>
#include <asm/synch.h>
+#include <asm/asm-compat.h>
.text
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 8b69f868298c..e613b02bb2f0 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -304,6 +304,9 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_
printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
void do_barrier_nospec_fixups(bool enable)
{
void *start, *end;
@@ -313,8 +316,38 @@ void do_barrier_nospec_fixups(bool enable)
do_barrier_nospec_fixups_range(enable, start, end);
}
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
+{
+ unsigned int instr[2], *dest;
+ long *start, *end;
+ int i;
+
+ start = fixup_start;
+ end = fixup_end;
+
+ instr[0] = PPC_INST_NOP;
+ instr[1] = PPC_INST_NOP;
+
+ if (enable) {
+ pr_info("barrier-nospec: using isync; sync as speculation barrier\n");
+ instr[0] = PPC_INST_ISYNC;
+ instr[1] = PPC_INST_SYNC;
+ }
+
+ for (i = 0; start < end; start++, i++) {
+ dest = (void *)start + *start;
+
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, instr[0]);
+ patch_instruction(dest + 1, instr[1]);
+ }
+
+ printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
{
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index 3de7ac154f24..0526b2225260 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -20,6 +20,7 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/feature-fixups.h>
/* Note: This code relies on -mminimal-toc */
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index ae15eba49c1f..32e91994b6b2 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -15,6 +15,7 @@
#include <asm/ppc-opcode.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
#include <linux/errno.h>
#ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index b7b1237d4aa6..35a0ef932e1a 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -15,7 +15,6 @@
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/export.h>
-#include <linux/stringify.h>
#include <linux/smp.h>
/* waiting for a spinlock... */
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..844d8e774492 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
*/
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/ppc-opcode.h>
#define off8 r6
#define off16 r7
@@ -24,28 +25,102 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
+#define LVS lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+ vperm _VRT,_VRB,_VRA,_VRC
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
+#define LVS lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+ vperm _VRT,_VRA,_VRB,_VRC
#endif
-_GLOBAL(memcmp)
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS \
+ mflr r0; \
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+ std r0,16(r1); \
+ stdu r1,-STACKFRAMESIZE(r1); \
+ bl enter_vmx_ops; \
+ cmpwi cr1,r3,0; \
+ ld r0,STACKFRAMESIZE+16(r1); \
+ ld r3,STK_REG(R31)(r1); \
+ ld r4,STK_REG(R30)(r1); \
+ ld r5,STK_REG(R29)(r1); \
+ addi r1,r1,STACKFRAMESIZE; \
+ mtlr r0
+
+#define EXIT_VMX_OPS \
+ mflr r0; \
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+ std r0,16(r1); \
+ stdu r1,-STACKFRAMESIZE(r1); \
+ bl exit_vmx_ops; \
+ ld r0,STACKFRAMESIZE+16(r1); \
+ ld r3,STK_REG(R31)(r1); \
+ ld r4,STK_REG(R30)(r1); \
+ ld r5,STK_REG(R29)(r1); \
+ addi r1,r1,STACKFRAMESIZE; \
+ mtlr r0
+
+/*
+ * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
+ * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
+
+ * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ * ^ ^ ^
+ * 0xbbbb10 0xbbbb20 0xbbbb30
+ * ^
+ * _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ * for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ * for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ * for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+ lvx _v2nd_qw,_vaddr,off16; \
+ VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst have the same offset relative to an 8-byte boundary. The
+ *    handlers are named .Lsameoffset_xxxx.
+ * 2) src/dst have different offsets relative to an 8-byte boundary. The
+ *    handlers are named .Ldiffoffset_xxxx.
+ */
+_GLOBAL_TOC(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses do not have the
+ * same offset relative to an 8-byte boundary.
+ */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* Fall back to the short loop when comparing fewer than 8 bytes
+ * at aligned addresses.
+ */
+ cmpdi cr6,r5,7
beq cr1,.Lzero
- bne .Lshort
- bgt cr6,.Llong
+ bgt cr6,.Lno_short
.Lshort:
mtctr r5
-
1: lbz rA,0(r3)
lbz rB,0(r4)
subf. rC,rB,rA
@@ -78,11 +153,98 @@ _GLOBAL(memcmp)
li r3,0
blr
+.Lno_short:
+ dcbt 0,r3
+ dcbt 0,r4
+ bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+ /* Compare the leading bytes that are not 8-byte aligned so that the
+ * rest of the comparison can run on 8-byte aligned data.
+ */
+ andi. r6,r3,7
+
+ /* Try to compare the first double word, which is not 8-byte aligned:
+ * load the first double word at (src & ~7UL) and shift left the
+ * appropriate number of bits before the comparison.
+ */
+ rlwinm r6,r3,3,26,28
+ beq .Lsameoffset_8bytes_aligned
+ clrrdi r3,r3,3
+ clrrdi r4,r4,3
+ LD rA,0,r3
+ LD rB,0,r4
+ sld rA,rA,r6
+ sld rB,rB,r6
+ cmpld cr0,rA,rB
+ srwi r6,r6,3
+ bne cr0,.LcmpAB_lightweight
+ subfic r6,r6,8
+ subf. r5,r6,r5
+ addi r3,r3,8
+ addi r4,r4,8
+ beq .Lzero
+
+.Lsameoffset_8bytes_aligned:
+ /* Now we are 8-byte aligned.
+ * Use the .Llong loop if 32 or more bytes remain to compare.
+ */
+ cmpdi cr6,r5,31
+ bgt cr6,.Llong
+
+.Lcmp_lt32bytes:
+ /* compare 1 to 31 bytes; at least the r3 address is 8-byte aligned now */
+ cmpdi cr5,r5,7
+ srdi r0,r5,3
+ ble cr5,.Lcmp_rest_lt8bytes
+
+ /* handle 8 ~ 31 bytes */
+ clrldi r5,r5,61
+ mtctr r0
+2:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ bdnz 2b
+
+ cmpwi r5,0
+ beq .Lzero
+
+.Lcmp_rest_lt8bytes:
+ /* Fewer than 8 bytes remain to compare here; at least the s1
+ * address is 8-byte aligned.
+ * The next double words are loaded and shifted right by the
+ * appropriate number of bits.
+ */
+ subfic r6,r5,8
+ slwi r6,r6,3
+ LD rA,0,r3
+ LD rB,0,r4
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ b .Lzero
+
.Lnon_zero:
mr r3,rC
blr
.Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ /* Try to use the vmx loop if the length is equal to or greater than 4K */
+ cmpldi cr6,r5,VMX_THRESH
+ bge cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
+ /* At least s1 addr is aligned with 8 bytes */
li off8,8
li off16,16
li off24,24
@@ -232,4 +394,240 @@ _GLOBAL(memcmp)
ld r28,-32(r1)
ld r27,-40(r1)
blr
+
+.LcmpAB_lightweight: /* skip NV GPRS restore */
+ li r3,1
+ bgtlr
+ li r3,-1
+ blr
+
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+ /* Entered with src/dst addresses that have the same offset relative
+ * to an 8-byte boundary.
+ *
+ * There is an optimization based on the following observation:
+ * memcmp() tends to fail early, within the first 32 bytes.
+ * Before using VMX instructions, which incur a 32x128-bit VMX
+ * register load/restore penalty, we compare the first 32 bytes
+ * so that we catch the ~80% of cases that fail early.
+ */
+
+ li r0,4
+ mtctr r0
+.Lsameoffset_prechk_32B_loop:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ addi r5,r5,-8
+ bdnz .Lsameoffset_prechk_32B_loop
+
+ ENTER_VMX_OPS
+ beq cr1,.Llong_novmx_cmp
+
+3:
+ /* check whether r4 has the same offset as r3 relative to a
+ * 16-byte boundary.
+ */
+ xor r0,r3,r4
+ andi. r0,r0,0xf
+ bne .Ldiffoffset_vmx_cmp_start
+
+ /* The length is at least 4KB. Align further to a 16-byte boundary.
+ */
+ andi. rA,r3,8
+ LD rA,0,r3
+ beq 4f
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ addi r5,r5,-8
+
+ beq cr0,4f
+ /* save and restore cr0 */
+ mfocrf r5,128
+ EXIT_VMX_OPS
+ mtocrf 128,r5
+ b .LcmpAB_lightweight
+
+4:
+ /* compare 32 bytes for each loop */
+ srdi r0,r5,5
+ mtctr r0
+ clrldi r5,r5,59
+ li off16,16
+
+.balign 16
+5:
+ lvx v0,0,r3
+ lvx v1,0,r4
+ VCMPEQUD_RC(v0,v0,v1)
+ bnl cr6,7f
+ lvx v0,off16,r3
+ lvx v1,off16,r4
+ VCMPEQUD_RC(v0,v0,v1)
+ bnl cr6,6f
+ addi r3,r3,32
+ addi r4,r4,32
+ bdnz 5b
+
+ EXIT_VMX_OPS
+ cmpdi r5,0
+ beq .Lzero
+ b .Lcmp_lt32bytes
+
+6:
+ addi r3,r3,16
+ addi r4,r4,16
+
+7:
+ /* diff the last 16 bytes */
+ EXIT_VMX_OPS
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ li off8,8
+ bne cr0,.LcmpAB_lightweight
+
+ LD rA,off8,r3
+ LD rB,off8,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ b .Lzero
+#endif
+
+.Ldiffoffset_8bytes_make_align_start:
+ /* now try to align s1 with 8 bytes */
+ rlwinm r6,r3,3,26,28
+ beq .Ldiffoffset_align_s1_8bytes
+
+ clrrdi r3,r3,3
+ LD rA,0,r3
+ LD rB,0,r4 /* unaligned load */
+ sld rA,rA,r6
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ srwi r6,r6,3
+ bne cr0,.LcmpAB_lightweight
+
+ subfic r6,r6,8
+ subf. r5,r6,r5
+ addi r3,r3,8
+ add r4,r4,r6
+
+ beq .Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+ /* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ /* only do vmx ops when the size is equal to or greater than 4K bytes */
+ cmpdi cr5,r5,VMX_THRESH
+ bge cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
+ cmpdi cr5,r5,31
+ ble cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+ b .Llong_novmx_cmp
+#else
+ b .Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+ /* perform a 32-byte pre-check before enabling
+ * VMX operations.
+ */
+ li r0,4
+ mtctr r0
+.Ldiffoffset_prechk_32B_loop:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ addi r3,r3,8
+ addi r4,r4,8
+ bne cr0,.LcmpAB_lightweight
+ addi r5,r5,-8
+ bdnz .Ldiffoffset_prechk_32B_loop
+
+ ENTER_VMX_OPS
+ beq cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+ /* Firstly try to align r3 with 16 bytes */
+ andi. r6,r3,0xf
+ li off16,16
+ beq .Ldiffoffset_vmx_s1_16bytes_align
+
+ LVS v3,0,r3
+ LVS v4,0,r4
+
+ lvx v5,0,r3
+ lvx v6,0,r4
+ LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+ VCMPEQUB_RC(v7,v9,v10)
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ subfic r6,r6,16
+ subf r5,r6,r5
+ add r3,r3,r6
+ add r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+ /* now s1 is aligned with 16 bytes */
+ lvx v6,0,r4
+ LVS v4,0,r4
+ srdi r6,r5,5 /* loop for 32 bytes each */
+ clrldi r5,r5,59
+ mtctr r6
+
+.balign 16
+.Ldiffoffset_vmx_32bytesloop:
+ /* the first qw of r4 was saved in v6 */
+ lvx v9,0,r3
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+ VCMPEQUB_RC(v7,v9,v10)
+ vor v6,v8,v8
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ addi r3,r3,16
+ addi r4,r4,16
+
+ lvx v9,0,r3
+ LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+ VCMPEQUB_RC(v7,v9,v10)
+ vor v6,v8,v8
+ bnl cr6,.Ldiffoffset_vmx_diff_found
+
+ addi r3,r3,16
+ addi r4,r4,16
+
+ bdnz .Ldiffoffset_vmx_32bytesloop
+
+ EXIT_VMX_OPS
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+ EXIT_VMX_OPS
+ /* either way, the difference will appear within the next 16 bytes */
+ li r5,16
+ b .Lcmp_lt32bytes
+
+#endif
EXPORT_SYMBOL(memcmp)
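
The rewritten memcmp above dispatches on whether the two buffers share the same offset within an 8-byte word, and both large-buffer paths run a 32-byte scalar pre-check before paying the VMX enter/exit cost. The C sketch below is purely illustrative of that dispatch, under the assumption that the scalar and VMX comparison loops live elsewhere; the names are not kernel identifiers.

#include <stddef.h>

#define VMX_THRESH 4096		/* same threshold as the asm above */

enum cmp_path { SHORT_LOOP, SAMEOFFSET, DIFFOFFSET };

static enum cmp_path pick_path(const void *s1, const void *s2, size_t n)
{
	unsigned long delta = (unsigned long)s1 ^ (unsigned long)s2;

	if (n < 8)
		return SHORT_LOOP;	/* byte-by-byte .Lshort loop */
	if ((delta & 7) == 0)
		return SAMEOFFSET;	/* .Lsameoffset_* handlers */
	return DIFFOFFSET;		/* .Ldiffoffset_* handlers */
}

/* Buffers of VMX_THRESH bytes or more first compare 32 bytes with plain
 * loads; only when those all match is the VMX register save/restore
 * cost paid for the rest of the comparison. */
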
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 8d8265be1a59..273ea67e60a1 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -9,6 +9,13 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE 0
+#endif
.align 7
_GLOBAL_TOC(memcpy)
@@ -20,10 +27,8 @@ BEGIN_FTR_SECTION
#endif
FTR_SECTION_ELSE
#ifdef CONFIG_PPC_BOOK3S_64
-#ifndef SELFTEST
b memcpy_power7
#endif
-#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#ifdef __LITTLE_ENDIAN__
/* dumb little-endian memcpy that will get replaced at runtime */
@@ -47,6 +52,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
cleared.
At the time of writing the only CPU that has this combination of bits
set is Power6. */
+test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
nop
FTR_SECTION_ELSE
@@ -55,6 +61,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
addi r3,r3,-16
+test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
andi. r0,r4,7
bne .Lsrc_unaligned
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d3da08..89bfefcf7fcc 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -19,7 +19,10 @@
*/
#include <asm/ppc_asm.h>
-_GLOBAL(memcpy_power7)
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE 0
+#endif
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
@@ -29,20 +32,17 @@ _GLOBAL(memcpy_power7)
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
-#ifdef CONFIG_ALTIVEC
+_GLOBAL(memcpy_power7)
cmpldi r5,16
cmpldi cr1,r5,4096
-
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-
blt .Lshort_copy
- bgt cr1,.Lvmx_copy
-#else
- cmpldi r5,16
-
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- blt .Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+ bgt cr1, .Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
.Lnonvmx_copy:
@@ -223,14 +223,14 @@ _GLOBAL(memcpy_power7)
addi r1,r1,STACKFRAMESIZE
b .Lnonvmx_copy
-#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
mflr r0
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl enter_vmx_copy
+ bl enter_vmx_ops
cmpwi cr1,r3,0
ld r0,STACKFRAMESIZE+16(r1)
ld r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
.Lvmx_unaligned_copy:
/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
15: addi r1,r1,STACKFRAMESIZE
ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- b exit_vmx_copy /* tail call optimise */
+ b exit_vmx_ops /* tail call optimise */
#endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+ .text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ * by subtracting 0x01010101, and seeing if any of the high bits of each
+ * byte changed from 0 to 1. This works because the least significant
+ * 0 byte must have had no incoming carry (otherwise it's not the least
+ * significant), so it is 0x00 - 0x01 == 0xff. For all other
+ * byte values, either they have the high bit set initially, or when
+ * 1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ * have their high bit set. The expression here is
+ * (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ * there were no 0x00 bytes in the word. You get 0x80 in bytes that
+ * match, but possibly a false 0x80 match in the byte one position more
+ * significant than a true match, due to carries. For little-endian this is
+ * of no consequence since the least significant match is the one
+ * we're interested in, but big-endian needs method 2 to find which
+ * byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ * calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ * This produces 0x80 in each byte that was zero, and 0x00 in all
+ * the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ * byte, and the '| x' part ensures that bytes with the high bit set
+ * produce 0x00. The addition will carry into the high bit of each byte
+ * iff that byte had one of its low 7 bits set. We can then just see
+ * which was the most significant bit set and divide by 8 to find how
+ * many to add to the index.
+ * This is from the book 'The PowerPC Compiler Writer's Guide',
+ * by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+ andi. r0, r3, 3
+ lis r7, 0x0101
+ addi r10, r3, -4
+ addic r7, r7, 0x0101 /* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+ rotlwi r6, r7, 31 /* r6 = 0x80808080 (himagic) */
+ bne- 3f
+ .balign IFETCH_ALIGN_BYTES
+1: lwzu r9, 4(r10)
+2: subf r8, r7, r9
+ and. r8, r8, r6
+ beq+ 1b
+ andc. r8, r8, r9
+ beq+ 1b
+ andc r8, r9, r6
+ orc r9, r9, r6
+ subfe r8, r6, r8
+ nor r8, r8, r9
+ cntlzw r8, r8
+ subf r3, r3, r10
+ srwi r8, r8, 3
+ add r3, r3, r8
+ blr
+
+ /* Misaligned string: make sure the bytes before the string are not seen as 0 */
+3: xor r10, r10, r0
+ orc r8, r8, r8
+ lwzu r9, 4(r10)
+ slwi r0, r0, 3
+ srw r8, r8, r0
+ orc r9, r9, r8
+ b 2b
+EXPORT_SYMBOL(strlen)
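
The algorithm comment in strlen_32.S describes two word-at-a-time tests. A small C rendering of both expressions is given below for clarity; it is an illustrative assumption, not part of the patch.

#include <stdint.h>

/* Step 1: non-zero iff the 32-bit word contains at least one 0x00 byte.
 * On big-endian this can also flag the byte just above a true match. */
static int has_zero_byte(uint32_t x)
{
	return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}

/* Step 2: yields 0x80 exactly in the bytes that were zero and 0x00
 * elsewhere; counting leading zeros of the result and dividing by 8
 * gives the index of the first zero byte on big-endian. */
static uint32_t zero_byte_mask(uint32_t x)
{
	return ~(((x & ~0x80808080u) - 0x80808080u - 1u) | x | ~0x80808080u);
}
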
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cdcaca9..9f340494a8ac 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
return 0;
}
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
{
if (in_interrupt())
return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
* passed a pointer to the destination which we return as required by a
* memcpy implementation.
*/
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
{
disable_kernel_altivec();
preempt_enable();