17 files changed, 839 insertions, 375 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
 obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
-obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o strlen_32.o
 
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index e0d881ab304e..850f3b8f4da5 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -195,6 +195,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
 	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
+int patch_branch_site(s32 *site, unsigned long target, int flags)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)((unsigned long)site + *site);
+	return patch_instruction(addr, create_branch(addr, target, flags));
+}
+
+int patch_instruction_site(s32 *site, unsigned int instr)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)((unsigned long)site + *site);
+	return patch_instruction(addr, instr);
+}
+
 bool is_offset_in_branch_range(long offset)
 {
 	/*
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index da425bb6b369..ba66846fe973 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -13,6 +13,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/code-patching-asm.h>
 
 #define COPY_16_BYTES		\
 	lwz	r7,4(r4);	\
@@ -107,8 +108,8 @@ _GLOBAL(memset)
 	 * Skip optimised bloc until cache is enabled. Will be replaced
 	 * by 'bne' during boot to use normal procedure if r4 is not zero
 	 */
-_GLOBAL(memset_nocache_branch)
-	b	2f
+5:	b	2f
+	patch_site	5b, patch__memset_nocache
 
 	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
 	add	r8,r7,r5
@@ -168,7 +169,9 @@ _GLOBAL(memmove)
 	/* fall through */
 
 _GLOBAL(memcpy)
-	b	generic_memcpy
+1:	b	generic_memcpy
+	patch_site	1b, patch__memcpy_nocache
+
 	add	r7,r3,r5		/* test if the src & dst overlap */
 	add	r8,r4,r5
 	cmplw	0,r4,r7
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 8d5034f645f3..694390357667 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -11,6 +11,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/export.h>
+#include <asm/feature-fixups.h>
 
         .section        ".toc","aw"
 PPC64_CACHES:
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7ab20e..e38f956f7d9f 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 506677395681..96c514bee66b 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -9,6 +9,13 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE	0
+#endif
 
 #ifdef __BIG_ENDIAN__
 #define sLd sld		/* Shift towards low-numbered address. */
@@ -18,6 +25,28 @@
 #define sHd sld		/* Shift towards high-numbered address. */
 #endif
 
+/*
+ * These macros are used to generate exception table entries.
+ * The exception handlers below use the original arguments
+ * (stored on the stack) and the point where we're up to in
+ * the destination buffer, i.e. the address of the first
+ * unmodified byte.  Generally r3 points into the destination
+ * buffer, but the first unmodified byte is at a variable
+ * offset from r3.  In the code below, the symbol r3_offset
+ * is set to indicate the current offset at each point in
+ * the code.  This offset is then used as a negative offset
+ * from the exception handler code, and those instructions
+ * before the exception handlers are addi instructions that
+ * adjust r3 to point to the correct place.
+ */
+	.macro	lex		/* exception handler for load */
+100:	EX_TABLE(100b, .Lld_exc - r3_offset)
+	.endm
+
+	.macro	stex		/* exception handler for store */
+100:	EX_TABLE(100b, .Lst_exc - r3_offset)
+	.endm
+
 	.align	7
 _GLOBAL_TOC(__copy_tofrom_user)
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -28,7 +57,7 @@ FTR_SECTION_ELSE
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 #endif
 _GLOBAL(__copy_tofrom_user_base)
-	/* first check for a whole page copy on a page boundary */
+	/* first check for a 4kB copy on a 4kB boundary */
 	cmpldi	cr1,r5,16
 	cmpdi	cr6,r5,4096
 	or	r0,r3,r4
@@ -49,6 +78,7 @@ _GLOBAL(__copy_tofrom_user_base)
  * At the time of writing the only CPU that has this combination of bits
  * set is Power6.
  */
+test_feature = (SELFTEST_CASE == 1)
 BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
@@ -57,6 +87,8 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
 		    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
 	addi	r3,r3,-16
+r3_offset = 16
+test_feature = (SELFTEST_CASE == 0)
 BEGIN_FTR_SECTION
 	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
@@ -64,57 +96,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
 	srdi	r0,r5,5
 	cmpdi	cr1,r0,0
-20:	ld	r7,0(r4)
-220:	ld	r6,8(r4)
+lex;	ld	r7,0(r4)
+lex;	ld	r6,8(r4)
 	addi	r4,r4,16
 	mtctr	r0
 	andi.	r0,r5,0x10
 	beq	22f
 	addi	r3,r3,16
+r3_offset = 0
 	addi	r4,r4,-16
 	mr	r9,r7
 	mr	r8,r6
 	beq	cr1,72f
-21:	ld	r7,16(r4)
-221:	ld	r6,24(r4)
+21:
+lex;	ld	r7,16(r4)
+lex;	ld	r6,24(r4)
 	addi	r4,r4,32
-70:	std	r9,0(r3)
-270:	std	r8,8(r3)
-22:	ld	r9,0(r4)
-222:	ld	r8,8(r4)
-71:	std	r7,16(r3)
-271:	std	r6,24(r3)
+stex;	std	r9,0(r3)
+r3_offset = 8
+stex;	std	r8,8(r3)
+r3_offset = 16
+22:
+lex;	ld	r9,0(r4)
+lex;	ld	r8,8(r4)
+stex;	std	r7,16(r3)
+r3_offset = 24
+stex;	std	r6,24(r3)
 	addi	r3,r3,32
+r3_offset = 0
 	bdnz	21b
-72:	std	r9,0(r3)
-272:	std	r8,8(r3)
+72:
+stex;	std	r9,0(r3)
+r3_offset = 8
+stex;	std	r8,8(r3)
+r3_offset = 16
 	andi.	r5,r5,0xf
 	beq+	3f
 	addi	r4,r4,16
 .Ldo_tail:
 	addi	r3,r3,16
+r3_offset = 0
 	bf	cr7*4+0,246f
-244:	ld	r9,0(r4)
+lex;	ld	r9,0(r4)
 	addi	r4,r4,8
-245:	std	r9,0(r3)
+stex;	std	r9,0(r3)
 	addi	r3,r3,8
 246:	bf	cr7*4+1,1f
-23:	lwz	r9,0(r4)
+lex;	lwz	r9,0(r4)
 	addi	r4,r4,4
-73:	stw	r9,0(r3)
+stex;	stw	r9,0(r3)
 	addi	r3,r3,4
 1:	bf	cr7*4+2,2f
-44:	lhz	r9,0(r4)
+lex;	lhz	r9,0(r4)
 	addi	r4,r4,2
-74:	sth	r9,0(r3)
+stex;	sth	r9,0(r3)
 	addi	r3,r3,2
 2:	bf	cr7*4+3,3f
-45:	lbz	r9,0(r4)
-75:	stb	r9,0(r3)
+lex;	lbz	r9,0(r4)
+stex;	stb	r9,0(r3)
 3:	li	r3,0
 	blr
 
 .Lsrc_unaligned:
+r3_offset = 16
 	srdi	r6,r5,3
 	addi	r5,r5,-16
 	subf	r4,r0,r4
@@ -127,58 +171,69 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	add	r5,r5,r0
 	bt	cr7*4+0,28f
 
-24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
-25:	ld	r0,8(r4)
+lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
+lex;	ld	r0,8(r4)
 	sLd	r6,r9,r10
-26:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	or	r7,r7,r6
 	blt	cr6,79f
-27:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	b	2f
 
-28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
-29:	ldu	r9,8(r4)
+28:
+lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
+lex;	ldu	r9,8(r4)
 	sLd	r8,r0,r10
 	addi	r3,r3,-8
+r3_offset = 24
 	blt	cr6,5f
-30:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	sHd	r12,r9,r11
 	sLd	r6,r9,r10
-31:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	or	r12,r8,r12
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	addi	r3,r3,16
+r3_offset = 8
 	beq	cr6,78f
 
 1:	or	r7,r7,r6
-32:	ld	r0,8(r4)
-76:	std	r12,8(r3)
+lex;	ld	r0,8(r4)
+stex;	std	r12,8(r3)
+r3_offset = 16
 2:	sHd	r12,r9,r11
 	sLd	r6,r9,r10
-33:	ldu	r9,16(r4)
+lex;	ldu	r9,16(r4)
 	or	r12,r8,r12
-77:	stdu	r7,16(r3)
+stex;	stdu	r7,16(r3)
+r3_offset = 8
 	sHd	r7,r0,r11
 	sLd	r8,r0,r10
 	bdnz	1b
 
-78:	std	r12,8(r3)
+78:
+stex;	std	r12,8(r3)
+r3_offset = 16
 	or	r7,r7,r6
-79:	std	r7,16(r3)
+79:
+stex;	std	r7,16(r3)
+r3_offset = 24
 5:	sHd	r12,r9,r11
 	or	r12,r8,r12
-80:	std	r12,24(r3)
+stex;	std	r12,24(r3)
+r3_offset = 32
 	bne	6f
 	li	r3,0
 	blr
 6:	cmpwi	cr1,r5,8
 	addi	r3,r3,32
+r3_offset = 0
 	sLd	r9,r9,r10
 	ble	cr1,7f
-34:	ld	r0,8(r4)
+lex;	ld	r0,8(r4)
 	sHd	r7,r0,r11
 	or	r9,r7,r9
 7:
@@ -186,7 +241,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,32
 #endif
-94:	stw	r9,0(r3)
+stex;	stw	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,32
 #endif
@@ -195,7 +250,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,16
 #endif
-95:	sth	r9,0(r3)
+stex;	sth	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,16
 #endif
@@ -204,7 +259,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 #ifdef __BIG_ENDIAN__
 	rotldi	r9,r9,8
 #endif
-96:	stb	r9,0(r3)
+stex;	stb	r9,0(r3)
 #ifdef __LITTLE_ENDIAN__
 	rotrdi	r9,r9,8
 #endif
@@ -212,47 +267,55 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	blr
 
 .Ldst_unaligned:
+r3_offset = 0
 	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
 	subf	r5,r6,r5
 	li	r7,0
 	cmpldi	cr1,r5,16
 	bf	cr7*4+3,1f
-35:	lbz	r0,0(r4)
-81:	stb	r0,0(r3)
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lbz	r0,0(r4)
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	stb	r0,0(r3)
 	addi	r7,r7,1
 1:	bf	cr7*4+2,2f
-36:	lhzx	r0,r7,r4
-82:	sthx	r0,r7,r3
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lhzx	r0,r7,r4
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	sthx	r0,r7,r3
 	addi	r7,r7,2
 2:	bf	cr7*4+1,3f
-37:	lwzx	r0,r7,r4
-83:	stwx	r0,r7,r3
+100:	EX_TABLE(100b, .Lld_exc_r7)
+	lwzx	r0,r7,r4
+100:	EX_TABLE(100b, .Lst_exc_r7)
+	stwx	r0,r7,r3
 3:	PPC_MTOCRF(0x01,r5)
 	add	r4,r6,r4
 	add	r3,r6,r3
 	b	.Ldst_aligned
 
 .Lshort_copy:
+r3_offset = 0
 	bf	cr7*4+0,1f
-38:	lwz	r0,0(r4)
-39:	lwz	r9,4(r4)
+lex;	lwz	r0,0(r4)
+lex;	lwz	r9,4(r4)
 	addi	r4,r4,8
-84:	stw	r0,0(r3)
-85:	stw	r9,4(r3)
+stex;	stw	r0,0(r3)
+stex;	stw	r9,4(r3)
 	addi	r3,r3,8
 1:	bf	cr7*4+1,2f
-40:	lwz	r0,0(r4)
+lex;	lwz	r0,0(r4)
 	addi	r4,r4,4
-86:	stw	r0,0(r3)
+stex;	stw	r0,0(r3)
 	addi	r3,r3,4
 2:	bf	cr7*4+2,3f
-41:	lhz	r0,0(r4)
+lex;	lhz	r0,0(r4)
 	addi	r4,r4,2
-87:	sth	r0,0(r3)
+stex;	sth	r0,0(r3)
 	addi	r3,r3,2
 3:	bf	cr7*4+3,4f
-42:	lbz	r0,0(r4)
-88:	stb	r0,0(r3)
+lex;	lbz	r0,0(r4)
+stex;	stb	r0,0(r3)
 4:	li	r3,0
 	blr
 
@@ -260,48 +323,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * exception handlers follow
  * we have to return the number of bytes not copied
  * for an exception on a load, we set the rest of the destination to 0
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lld_exc - r3_offset as the handler address.
  */
 
-136:
-137:
+.Lld_exc_r7:
 	add	r3,r3,r7
-	b	1f
-130:
-131:
+	b	.Lld_exc
+
+	/* adjust by 24 */
 	addi	r3,r3,8
-120:
-320:
-122:
-322:
-124:
-125:
-126:
-127:
-128:
-129:
-133:
+	nop
+	/* adjust by 16 */
 	addi	r3,r3,8
-132:
+	nop
+	/* adjust by 8 */
 	addi	r3,r3,8
-121:
-321:
-344:
-134:
-135:
-138:
-139:
-140:
-141:
-142:
-123:
-144:
-145:
+	nop
 
 /*
- * here we have had a fault on a load and r3 points to the first
- * unmodified byte of the destination
+ * Here we have had a fault on a load and r3 points to the first
+ * unmodified byte of the destination.  We use the original arguments
+ * and r3 to work out how much wasn't copied.  Since we load some
+ * distance ahead of the stores, we continue copying byte-by-byte until
+ * we hit the load fault again in order to copy as much as possible.
  */
-1:	ld	r6,-24(r1)
+.Lld_exc:
+	ld	r6,-24(r1)
 	ld	r4,-16(r1)
 	ld	r5,-8(r1)
 	subf	r6,r6,r3
@@ -312,9 +361,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * first see if we can copy any more bytes before hitting another exception
  */
 	mtctr	r5
+r3_offset = 0
+100:	EX_TABLE(100b, .Ldone)
 43:	lbz	r0,0(r4)
 	addi	r4,r4,1
-89:	stb	r0,0(r3)
+stex;	stb	r0,0(r3)
 	addi	r3,r3,1
 	bdnz	43b
 	li	r3,0		/* huh? all copied successfully this time? */
@@ -323,116 +374,63 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 /*
  * here we have trapped again, amount remaining is in ctr.
  */
-143:	mfctr	r3
+.Ldone:
+	mfctr	r3
 	blr
 
 /*
- * exception handlers for stores: we just need to work
- * out how many bytes weren't copied
+ * exception handlers for stores: we need to work out how many bytes
+ * weren't copied, and we may need to copy some more.
+ * Note that the number of bytes of instructions for adjusting r3 needs
+ * to equal the amount of the adjustment, due to the trick of using
+ * .Lst_exc - r3_offset as the handler address.
  */
-182:
-183:
+.Lst_exc_r7:
 	add	r3,r3,r7
-	b	1f
-371:
-180:
+	b	.Lst_exc
+
+	/* adjust by 24 */
 	addi	r3,r3,8
-171:
-177:
-179:
+	nop
+	/* adjust by 16 */
 	addi	r3,r3,8
-370:
-372:
-176:
-178:
+	nop
+	/* adjust by 8 */
 	addi	r3,r3,4
-185:
+	/* adjust by 4 */
 	addi	r3,r3,4
-170:
-172:
-345:
-173:
-174:
-175:
-181:
-184:
-186:
-187:
-188:
-189:	
-194:
-195:
-196:
-1:
-	ld	r6,-24(r1)
-	ld	r5,-8(r1)
-	add	r6,r6,r5
-	subf	r3,r3,r6	/* #bytes not copied */
+.Lst_exc:
+	ld	r6,-24(r1)	/* original destination pointer */
+	ld	r4,-16(r1)	/* original source pointer */
+	ld	r5,-8(r1)	/* original number of bytes */
+	add	r7,r6,r5
+	/*
+	 * If the destination pointer isn't 8-byte aligned,
+	 * we may have got the exception as a result of a
+	 * store that overlapped a page boundary, so we may be
+	 * able to copy a few more bytes.
+	 */
+17:	andi.	r0,r3,7
+	beq	19f
+	subf	r8,r6,r3	/* #bytes copied */
+100:	EX_TABLE(100b,19f)
+	lbzx	r0,r8,r4
+100:	EX_TABLE(100b,19f)
+	stb	r0,0(r3)
+	addi	r3,r3,1
+	cmpld	r3,r7
+	blt	17b
+19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
 	blr
 
-	EX_TABLE(20b,120b)
-	EX_TABLE(220b,320b)
-	EX_TABLE(21b,121b)
-	EX_TABLE(221b,321b)
-	EX_TABLE(70b,170b)
-	EX_TABLE(270b,370b)
-	EX_TABLE(22b,122b)
-	EX_TABLE(222b,322b)
-	EX_TABLE(71b,171b)
-	EX_TABLE(271b,371b)
-	EX_TABLE(72b,172b)
-	EX_TABLE(272b,372b)
-	EX_TABLE(244b,344b)
-	EX_TABLE(245b,345b)
-	EX_TABLE(23b,123b)
-	EX_TABLE(73b,173b)
-	EX_TABLE(44b,144b)
-	EX_TABLE(74b,174b)
-	EX_TABLE(45b,145b)
-	EX_TABLE(75b,175b)
-	EX_TABLE(24b,124b)
-	EX_TABLE(25b,125b)
-	EX_TABLE(26b,126b)
-	EX_TABLE(27b,127b)
-	EX_TABLE(28b,128b)
-	EX_TABLE(29b,129b)
-	EX_TABLE(30b,130b)
-	EX_TABLE(31b,131b)
-	EX_TABLE(32b,132b)
-	EX_TABLE(76b,176b)
-	EX_TABLE(33b,133b)
-	EX_TABLE(77b,177b)
-	EX_TABLE(78b,178b)
-	EX_TABLE(79b,179b)
-	EX_TABLE(80b,180b)
-	EX_TABLE(34b,134b)
-	EX_TABLE(94b,194b)
-	EX_TABLE(95b,195b)
-	EX_TABLE(96b,196b)
-	EX_TABLE(35b,135b)
-	EX_TABLE(81b,181b)
-	EX_TABLE(36b,136b)
-	EX_TABLE(82b,182b)
-	EX_TABLE(37b,137b)
-	EX_TABLE(83b,183b)
-	EX_TABLE(38b,138b)
-	EX_TABLE(39b,139b)
-	EX_TABLE(84b,184b)
-	EX_TABLE(85b,185b)
-	EX_TABLE(40b,140b)
-	EX_TABLE(86b,186b)
-	EX_TABLE(41b,141b)
-	EX_TABLE(87b,187b)
-	EX_TABLE(42b,142b)
-	EX_TABLE(88b,188b)
-	EX_TABLE(43b,143b)
-	EX_TABLE(89b,189b)
-
 /*
  * Routine to copy a whole page of data, optimized for POWER4.
  * On POWER4 it is more than 50% faster than the simple loop
  * above (following the .Ldst_aligned label).
  */
+	.macro	exc
+100:	EX_TABLE(100b, .Labort)
+	.endm
 .Lcopy_page_4K:
 	std	r31,-32(1)
 	std	r30,-40(1)
@@ -451,86 +449,86 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	li	r0,5
 0:	addi	r5,r5,-24
 	mtctr	r0
-20:	ld	r22,640(4)
-21:	ld	r21,512(4)
-22:	ld	r20,384(4)
-23:	ld	r11,256(4)
-24:	ld	r9,128(4)
-25:	ld	r7,0(4)
-26:	ld	r25,648(4)
-27:	ld	r24,520(4)
-28:	ld	r23,392(4)
-29:	ld	r10,264(4)
-30:	ld	r8,136(4)
-31:	ldu	r6,8(4)
+exc;	ld	r22,640(4)
+exc;	ld	r21,512(4)
+exc;	ld	r20,384(4)
+exc;	ld	r11,256(4)
+exc;	ld	r9,128(4)
+exc;	ld	r7,0(4)
+exc;	ld	r25,648(4)
+exc;	ld	r24,520(4)
+exc;	ld	r23,392(4)
+exc;	ld	r10,264(4)
+exc;	ld	r8,136(4)
+exc;	ldu	r6,8(4)
 	cmpwi	r5,24
 1:
-32:	std	r22,648(3)
-33:	std	r21,520(3)
-34:	std	r20,392(3)
-35:	std	r11,264(3)
-36:	std	r9,136(3)
-37:	std	r7,8(3)
-38:	ld	r28,648(4)
-39:	ld	r27,520(4)
-40:	ld	r26,392(4)
-41:	ld	r31,264(4)
-42:	ld	r30,136(4)
-43:	ld	r29,8(4)
-44:	std	r25,656(3)
-45:	std	r24,528(3)
-46:	std	r23,400(3)
-47:	std	r10,272(3)
-48:	std	r8,144(3)
-49:	std	r6,16(3)
-50:	ld	r22,656(4)
-51:	ld	r21,528(4)
-52:	ld	r20,400(4)
-53:	ld	r11,272(4)
-54:	ld	r9,144(4)
-55:	ld	r7,16(4)
-56:	std	r28,664(3)
-57:	std	r27,536(3)
-58:	std	r26,408(3)
-59:	std	r31,280(3)
-60:	std	r30,152(3)
-61:	stdu	r29,24(3)
-62:	ld	r25,664(4)
-63:	ld	r24,536(4)
-64:	ld	r23,408(4)
-65:	ld	r10,280(4)
-66:	ld	r8,152(4)
-67:	ldu	r6,24(4)
+exc;	std	r22,648(3)
+exc;	std	r21,520(3)
+exc;	std	r20,392(3)
+exc;	std	r11,264(3)
+exc;	std	r9,136(3)
+exc;	std	r7,8(3)
+exc;	ld	r28,648(4)
+exc;	ld	r27,520(4)
+exc;	ld	r26,392(4)
+exc;	ld	r31,264(4)
+exc;	ld	r30,136(4)
+exc;	ld	r29,8(4)
+exc;	std	r25,656(3)
+exc;	std	r24,528(3)
+exc;	std	r23,400(3)
+exc;	std	r10,272(3)
+exc;	std	r8,144(3)
+exc;	std	r6,16(3)
+exc;	ld	r22,656(4)
+exc;	ld	r21,528(4)
+exc;	ld	r20,400(4)
+exc;	ld	r11,272(4)
+exc;	ld	r9,144(4)
+exc;	ld	r7,16(4)
+exc;	std	r28,664(3)
+exc;	std	r27,536(3)
+exc;	std	r26,408(3)
+exc;	std	r31,280(3)
+exc;	std	r30,152(3)
+exc;	stdu	r29,24(3)
+exc;	ld	r25,664(4)
+exc;	ld	r24,536(4)
+exc;	ld	r23,408(4)
+exc;	ld	r10,280(4)
+exc;	ld	r8,152(4)
+exc;	ldu	r6,24(4)
 	bdnz	1b
-68:	std	r22,648(3)
-69:	std	r21,520(3)
-70:	std	r20,392(3)
-71:	std	r11,264(3)
-72:	std	r9,136(3)
-73:	std	r7,8(3)
-74:	addi	r4,r4,640
-75:	addi	r3,r3,648
+exc;	std	r22,648(3)
+exc;	std	r21,520(3)
+exc;	std	r20,392(3)
+exc;	std	r11,264(3)
+exc;	std	r9,136(3)
+exc;	std	r7,8(3)
+	addi	r4,r4,640
+	addi	r3,r3,648
 	bge	0b
 	mtctr	r5
-76:	ld	r7,0(4)
-77:	ld	r8,8(4)
-78:	ldu	r9,16(4)
+exc;	ld	r7,0(4)
+exc;	ld	r8,8(4)
+exc;	ldu	r9,16(4)
 3:
-79:	ld	r10,8(4)
-80:	std	r7,8(3)
-81:	ld	r7,16(4)
-82:	std	r8,16(3)
-83:	ld	r8,24(4)
-84:	std	r9,24(3)
-85:	ldu	r9,32(4)
-86:	stdu	r10,32(3)
+exc;	ld	r10,8(4)
+exc;	std	r7,8(3)
+exc;	ld	r7,16(4)
+exc;	std	r8,16(3)
+exc;	ld	r8,24(4)
+exc;	std	r9,24(3)
+exc;	ldu	r9,32(4)
+exc;	stdu	r10,32(3)
 	bdnz	3b
 4:
-87:	ld	r10,8(4)
-88:	std	r7,8(3)
-89:	std	r8,16(3)
-90:	std	r9,24(3)
-91:	std	r10,32(3)
+exc;	ld	r10,8(4)
+exc;	std	r7,8(3)
+exc;	std	r8,16(3)
+exc;	std	r9,24(3)
+exc;	std	r10,32(3)
 9:	ld	r20,-120(1)
 	ld	r21,-112(1)
 	ld	r22,-104(1)
@@ -550,7 +548,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  * on an exception, reset to the beginning and jump back into the
  * standard __copy_tofrom_user
  */
-100:	ld	r20,-120(1)
+.Labort:
+	ld	r20,-120(1)
 	ld	r21,-112(1)
 	ld	r22,-104(1)
 	ld	r23,-96(1)
@@ -566,78 +565,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	ld	r4,-16(r1)
 	li	r5,4096
 	b	.Ldst_aligned
-
-	EX_TABLE(20b,100b)
-	EX_TABLE(21b,100b)
-	EX_TABLE(22b,100b)
-	EX_TABLE(23b,100b)
-	EX_TABLE(24b,100b)
-	EX_TABLE(25b,100b)
-	EX_TABLE(26b,100b)
-	EX_TABLE(27b,100b)
-	EX_TABLE(28b,100b)
-	EX_TABLE(29b,100b)
-	EX_TABLE(30b,100b)
-	EX_TABLE(31b,100b)
-	EX_TABLE(32b,100b)
-	EX_TABLE(33b,100b)
-	EX_TABLE(34b,100b)
-	EX_TABLE(35b,100b)
-	EX_TABLE(36b,100b)
-	EX_TABLE(37b,100b)
-	EX_TABLE(38b,100b)
-	EX_TABLE(39b,100b)
-	EX_TABLE(40b,100b)
-	EX_TABLE(41b,100b)
-	EX_TABLE(42b,100b)
-	EX_TABLE(43b,100b)
-	EX_TABLE(44b,100b)
-	EX_TABLE(45b,100b)
-	EX_TABLE(46b,100b)
-	EX_TABLE(47b,100b)
-	EX_TABLE(48b,100b)
-	EX_TABLE(49b,100b)
-	EX_TABLE(50b,100b)
-	EX_TABLE(51b,100b)
-	EX_TABLE(52b,100b)
-	EX_TABLE(53b,100b)
-	EX_TABLE(54b,100b)
-	EX_TABLE(55b,100b)
-	EX_TABLE(56b,100b)
-	EX_TABLE(57b,100b)
-	EX_TABLE(58b,100b)
-	EX_TABLE(59b,100b)
-	EX_TABLE(60b,100b)
-	EX_TABLE(61b,100b)
-	EX_TABLE(62b,100b)
-	EX_TABLE(63b,100b)
-	EX_TABLE(64b,100b)
-	EX_TABLE(65b,100b)
-	EX_TABLE(66b,100b)
-	EX_TABLE(67b,100b)
-	EX_TABLE(68b,100b)
-	EX_TABLE(69b,100b)
-	EX_TABLE(70b,100b)
-	EX_TABLE(71b,100b)
-	EX_TABLE(72b,100b)
-	EX_TABLE(73b,100b)
-	EX_TABLE(74b,100b)
-	EX_TABLE(75b,100b)
-	EX_TABLE(76b,100b)
-	EX_TABLE(77b,100b)
-	EX_TABLE(78b,100b)
-	EX_TABLE(79b,100b)
-	EX_TABLE(80b,100b)
-	EX_TABLE(81b,100b)
-	EX_TABLE(82b,100b)
-	EX_TABLE(83b,100b)
-	EX_TABLE(84b,100b)
-	EX_TABLE(85b,100b)
-	EX_TABLE(86b,100b)
-	EX_TABLE(87b,100b)
-	EX_TABLE(88b,100b)
-	EX_TABLE(89b,100b)
-	EX_TABLE(90b,100b)
-	EX_TABLE(91b,100b)
-
 EXPORT_SYMBOL(__copy_tofrom_user)
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 215e4760c09f..1a1fe180af62 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,11 @@
  */
 #include <asm/ppc_asm.h>
 
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE	0
+#endif
+
 #ifdef __BIG_ENDIAN__
 #define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
 #define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
@@ -80,7 +85,6 @@
 
 
 _GLOBAL(__copy_tofrom_user_power7)
-#ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,3328
 
@@ -89,15 +93,12 @@ _GLOBAL(__copy_tofrom_user_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 
 	blt	.Lshort_copy
-	bge	cr1,.Lvmx_copy
-#else
-	cmpldi	r5,16
 
-	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
-	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
-
-	blt	.Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+	bgt	cr1,.Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
 
 .Lnonvmx_copy:
@@ -278,8 +279,8 @@ err1;	stb	r0,0(r3)
 	addi	r1,r1,STACKFRAMESIZE
 	b	.Lnonvmx_copy
 
-#ifdef CONFIG_ALTIVEC
 .Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S
index f16cec989506..ee7c5fd5fc64 100644
--- a/arch/powerpc/lib/feature-fixups-test.S
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -11,6 +11,7 @@
 #include <asm/feature-fixups.h>
 #include <asm/ppc_asm.h>
 #include <asm/synch.h>
+#include <asm/asm-compat.h>
 
 	.text
 
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 8b69f868298c..e613b02bb2f0 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -304,6 +304,9 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_
 	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
 }
 
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
 void do_barrier_nospec_fixups(bool enable)
 {
 	void *start, *end;
@@ -313,8 +316,38 @@ void do_barrier_nospec_fixups(bool enable)
 
 	do_barrier_nospec_fixups_range(enable, start, end);
 }
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
 
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
+{
+	unsigned int instr[2], *dest;
+	long *start, *end;
+	int i;
+
+	start = fixup_start;
+	end = fixup_end;
+
+	instr[0] = PPC_INST_NOP;
+	instr[1] = PPC_INST_NOP;
+
+	if (enable) {
+		pr_info("barrier-nospec: using isync; sync as speculation barrier\n");
+		instr[0] = PPC_INST_ISYNC;
+		instr[1] = PPC_INST_SYNC;
+	}
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+		patch_instruction(dest, instr[0]);
+		patch_instruction(dest + 1, instr[1]);
+	}
+
+	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
 
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index 3de7ac154f24..0526b2225260 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -20,6 +20,7 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/feature-fixups.h>
 
 /* Note: This code relies on -mminimal-toc */
 
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index ae15eba49c1f..32e91994b6b2 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -15,6 +15,7 @@
 #include <asm/ppc-opcode.h>
 #include <asm/reg.h>
 #include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
 #include <linux/errno.h>
 
 #ifdef CONFIG_PPC_FPU
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index b7b1237d4aa6..35a0ef932e1a 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
-#include <linux/stringify.h>
 #include <linux/smp.h>
 
 /* waiting for a spinlock... */
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..844d8e774492 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/ppc-opcode.h>
 
 #define off8	r6
 #define off16	r7
@@ -24,28 +25,102 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
-_GLOBAL(memcmp)
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
+_GLOBAL_TOC(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +153,98 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	rlwinm  r6,r3,3,26,28
+	beq     .Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi   cr5,r5,7
+	srdi    r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi  r5,r5,61
+	mtctr   r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi   r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	slwi	r6,r6,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* Try to use vmx loop if length is equal or greater than 4K */
+	cmpldi  cr6,r5,VMX_THRESH
+	bge	cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +394,240 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary.
+	 *
+	 * There is an optimization based on following fact: memcmp()
+	 * prones to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
+	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf  r5,128
+	EXIT_VMX_OPS
+	mtocrf  128,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	rlwinm  r6,r3,3,26,28
+	beq     .Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4  /* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
+	cmpdi   cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
+	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	/* perform a 32 bytes pre-checking before
+	 * enable VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
+
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	VCMPEQUB_RC(v7,v9,v10)
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 8d8265be1a59..273ea67e60a1 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -9,6 +9,13 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifndef SELFTEST_CASE
+/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
+#define SELFTEST_CASE	0
+#endif
 
 	.align	7
 _GLOBAL_TOC(memcpy)
@@ -20,10 +27,8 @@ BEGIN_FTR_SECTION
 #endif
 FTR_SECTION_ELSE
 #ifdef CONFIG_PPC_BOOK3S_64
-#ifndef SELFTEST
 	b	memcpy_power7
 #endif
-#endif
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 #ifdef __LITTLE_ENDIAN__
 	/* dumb little-endian memcpy that will get replaced at runtime */
@@ -47,6 +52,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
    cleared.
    At the time of writing the only CPU that has this combination of bits
    set is Power6. */
+test_feature = (SELFTEST_CASE == 1)
 BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
@@ -55,6 +61,7 @@ ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                     CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
 	addi	r3,r3,-16
+test_feature = (SELFTEST_CASE == 0)
 BEGIN_FTR_SECTION
 	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d3da08..89bfefcf7fcc 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -19,7 +19,10 @@
  */
 #include <asm/ppc_asm.h>
 
-_GLOBAL(memcpy_power7)
+#ifndef SELFTEST_CASE
+/* 0 == don't use VMX, 1 == use VMX */
+#define SELFTEST_CASE	0
+#endif
 
 #ifdef __BIG_ENDIAN__
 #define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
@@ -29,20 +32,17 @@ _GLOBAL(memcpy_power7)
 #define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
 #endif
 
-#ifdef CONFIG_ALTIVEC
+_GLOBAL(memcpy_power7)
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
-
 	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-
 	blt	.Lshort_copy
-	bgt	cr1,.Lvmx_copy
-#else
-	cmpldi	r5,16
-
-	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 
-	blt	.Lshort_copy
+#ifdef CONFIG_ALTIVEC
+test_feature = SELFTEST_CASE
+BEGIN_FTR_SECTION
+	bgt	cr1, .Lvmx_copy
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
 
 .Lnonvmx_copy:
@@ -223,14 +223,14 @@ _GLOBAL(memcpy_power7)
 	addi	r1,r1,STACKFRAMESIZE
 	b	.Lnonvmx_copy
 
-#ifdef CONFIG_ALTIVEC
 .Lvmx_copy:
+#ifdef CONFIG_ALTIVEC
 	mflr	r0
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+	.text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ *    by subtracting 0x01010101, and seeing if any of the high bits of each
+ *    byte changed from 0 to 1. This works because the least significant
+ *    0 byte must have had no incoming carry (otherwise it's not the least
+ *    significant), so it is 0x00 - 0x01 == 0xff. For all other
+ *    byte values, either they have the high bit set initially, or when
+ *    1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ *    have their high bit set. The expression here is
+ *    (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ *    there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+ *    match, but possibly false 0x80 matches in the next more significant
+ *    byte to a true match due to carries.  For little-endian this is
+ *    of no consequence since the least significant match is the one
+ *    we're interested in, but big-endian needs method 2 to find which
+ *    byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ *    calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ *    This produces 0x80 in each byte that was zero, and 0x00 in all
+ *    the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ *    byte, and the '| x' part ensures that bytes with the high bit set
+ *    produce 0x00. The addition will carry into the high bit of each byte
+ *    iff that byte had one of its low 7 bits set. We can then just see
+ *    which was the most significant bit set and divide by 8 to find how
+ *    many to add to the index.
+ *    This is from the book 'The PowerPC Compiler Writer's Guide',
+ *    by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+	andi.   r0, r3, 3
+	lis	r7, 0x0101
+	addi	r10, r3, -4
+	addic	r7, r7, 0x0101	/* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+	rotlwi	r6, r7, 31 	/* r6 = 0x80808080 (himagic) */
+	bne-	3f
+	.balign IFETCH_ALIGN_BYTES
+1:	lwzu	r9, 4(r10)
+2:	subf	r8, r7, r9
+	and.	r8, r8, r6
+	beq+	1b
+	andc.	r8, r8, r9
+	beq+	1b
+	andc	r8, r9, r6
+	orc	r9, r9, r6
+	subfe	r8, r6, r8
+	nor	r8, r8, r9
+	cntlzw	r8, r8
+	subf	r3, r3, r10
+	srwi	r8, r8, 3
+	add	r3, r3, r8
+	blr
+
+	/* Missaligned string: make sure bytes before string are seen not 0 */
+3:	xor	r10, r10, r0
+	orc	r8, r8, r8
+	lwzu	r9, 4(r10)
+	slwi	r0, r0, 3
+	srw	r8, r8, r0
+	orc	r9, r9, r8
+	b	2b
+EXPORT_SYMBOL(strlen)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cdcaca9..9f340494a8ac 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();