Diffstat (limited to 'arch/powerpc/mm')
26 files changed, 670 insertions, 1041 deletions
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index f4c6472f2fc4..f29212e40f40 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -22,8 +22,11 @@ extern int __map_without_ltlbs; +static unsigned long block_mapped_ram; + /* - * Return PA for this VA if it is in IMMR area, or 0 + * Return PA for this VA if it is in an area mapped with LTLBs. + * Otherwise, returns 0 */ phys_addr_t v_block_mapped(unsigned long va) { @@ -33,11 +36,13 @@ phys_addr_t v_block_mapped(unsigned long va) return 0; if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); return 0; } /* - * Return VA for a given PA or 0 if not mapped + * Return VA for a given PA mapped with LTLBs or 0 if not mapped */ unsigned long p_block_mapped(phys_addr_t pa) { @@ -47,6 +52,8 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); return 0; } @@ -58,7 +65,7 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR @@ -80,7 +87,7 @@ void __init MMU_init_hw(void) #endif } -static void mmu_mapin_immr(void) +static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; @@ -96,8 +103,11 @@ static void mmu_mapin_immr(void) extern unsigned int DTLBMiss_jmp; #endif extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; +#ifndef CONFIG_PIN_TLB_TEXT +extern unsigned int ITLBMiss_cmp; +#endif -void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) { unsigned int instr = *addr; @@ -116,6 +126,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top) #ifndef CONFIG_PIN_TLB_IMMR patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); #endif +#ifndef CONFIG_PIN_TLB_TEXT + mmu_patch_cmp_limit(&ITLBMiss_cmp, 0); +#endif } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); } @@ -133,11 +146,13 @@ unsigned long __init mmu_mapin_ram(unsigned long top) if (mapped) memblock_set_current_limit(mapped); + block_mapped_ram = mapped; + return mapped; } -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 7414034df1c3..fb844d2f266e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,7 +8,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - init-common.o + init-common.o mmu_context.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o @@ -22,8 +22,6 @@ ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif -obj-$(CONFIG_PPC_ICSWX) += icswx.o -obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o obj-$(CONFIG_40x) += 
40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index b1c144b03fcf..5c4c93dcff19 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -205,7 +205,7 @@ static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, aps_index = calculate_pagesize(st, aps, "actual"); if (aps_index != 2) seq_printf(st->seq, "LP enc: %lx", lp); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 44fe4833910f..c9282d27b203 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -350,7 +350,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4c422632047b..4797d08581ce 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -45,43 +45,39 @@ #include <asm/siginfo.h> #include <asm/debug.h> -#include "icswx.h" - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) +static inline bool notify_page_fault(struct pt_regs *regs) { - int ret = 0; + bool ret = false; +#ifdef CONFIG_KPROBES /* kprobe_running() needs smp_processor_id() */ if (!user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = 1; + ret = true; preempt_enable(); } +#endif /* CONFIG_KPROBES */ + + if (unlikely(debugger_fault_handler(regs))) + ret = true; return ret; } -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif /* * Check whether the instruction at regs->nip is a store using * an update addressing form which will update r1. 
*/ -static int store_updates_sp(struct pt_regs *regs) +static bool store_updates_sp(struct pt_regs *regs) { unsigned int inst; if (get_user(inst, (unsigned int __user *)regs->nip)) - return 0; + return false; /* check for 1 in the rA field */ if (((inst >> 16) & 0x1f) != 1) - return 0; + return false; /* check major opcode */ switch (inst >> 26) { case 37: /* stwu */ @@ -89,7 +85,7 @@ static int store_updates_sp(struct pt_regs *regs) case 45: /* sthu */ case 53: /* stfsu */ case 55: /* stfdu */ - return 1; + return true; case 62: /* std or stdu */ return (inst & 3) == 1; case 31: @@ -101,18 +97,53 @@ static int store_updates_sp(struct pt_regs *regs) case 439: /* sthux */ case 695: /* stfsux */ case 759: /* stfdux */ - return 1; + return true; } } - return 0; + return false; } /* * do_page_fault error handling helpers */ -#define MM_FAULT_RETURN 0 -#define MM_FAULT_CONTINUE -1 -#define MM_FAULT_ERR(sig) (sig) +static int +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) +{ + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception(SIGSEGV, regs, si_code, address); + + return 0; +} + +static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) +{ + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); +} + +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + return __bad_area_nosemaphore(regs, address, si_code); +} + +static noinline int bad_area(struct pt_regs *regs, unsigned long address) +{ + return __bad_area(regs, address, SEGV_MAPERR); +} static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) @@ -121,7 +152,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int lsb = 0; if (!user_mode(regs)) - return MM_FAULT_ERR(SIGBUS); + return SIGBUS; current->thread.trap_nr = BUS_ADRERR; info.si_signo = SIGBUS; @@ -142,25 +173,17 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, #endif info.si_addr_lsb = lsb; force_sig_info(SIGBUS, &info, current); - return MM_FAULT_RETURN; + return 0; } static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) { /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue the pagefault. + * Kernel page fault interrupted by SIGKILL. We have no reason to + * continue processing. */ - if (fatal_signal_pending(current)) { - /* Coming from kernel, we need to deal with uaccess fixups */ - if (user_mode(regs)) - return MM_FAULT_RETURN; - return MM_FAULT_ERR(SIGKILL); - } - - /* No fault: be happy */ - if (!(fault & VM_FAULT_ERROR)) - return MM_FAULT_CONTINUE; + if (fatal_signal_pending(current) && !user_mode(regs)) + return SIGKILL; /* Out of memory */ if (fault & VM_FAULT_OOM) { @@ -169,19 +192,176 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * made us unable to handle the page fault gracefully. 
*/ if (!user_mode(regs)) - return MM_FAULT_ERR(SIGKILL); + return SIGSEGV; pagefault_out_of_memory(); - return MM_FAULT_RETURN; + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); + else if (fault & VM_FAULT_SIGSEGV) + return bad_area_nosemaphore(regs, addr); + else + BUG(); + } + return 0; +} + +/* Is this a bad kernel fault ? */ +static bool bad_kernel_fault(bool is_exec, unsigned long error_code, + unsigned long address) +{ + if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) { + printk_ratelimited(KERN_CRIT "kernel tried to execute" + " exec-protected page (%lx) -" + "exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, + current_uid())); } + return is_exec || (address >= TASK_SIZE); +} - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) - return do_sigbus(regs, addr, fault); +static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma, + bool store_update_sp) +{ + /* + * N.B. The POWER/Open ABI allows programs to access up to + * 288 bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + return true; - /* We don't understand the fault code, this is fatal */ - BUG(); - return MM_FAULT_CONTINUE; + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. + */ + if (address + 2048 < uregs->gpr[1] && !store_update_sp) + return true; + } + return false; +} + +static bool access_error(bool is_write, bool is_exec, + struct vm_area_struct *vma) +{ + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. 
+ */ + if (is_exec) { + return !(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE))); + } + + if (is_write) { + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return true; + return false; + } + + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return true; + + return false; } +#ifdef CONFIG_PPC_SMLPAR +static inline void cmo_account_page_fault(void) +{ + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +} +#else +static inline void cmo_account_page_fault(void) { } +#endif /* CONFIG_PPC_SMLPAR */ + +#ifdef CONFIG_PPC_STD_MMU +static void sanity_check_fault(bool is_write, unsigned long error_code) +{ + /* + * For hash translation mode, we should never get a + * PROTFAULT. Any update to pte to reduce access will result in us + * removing the hash page table entry, thus resulting in a DSISR_NOHPTE + * fault instead of DSISR_PROTFAULT. + * + * A pte update to relax the access will not result in a hash page table + * entry invalidate and hence can result in DSISR_PROTFAULT. + * ptep_set_access_flags() doesn't do a hpte flush. This is why we have + * the special !is_write in the below conditional. + * + * For platforms that doesn't supports coherent icache and do support + * per page noexec bit, we do setup things such that we do the + * sync between D/I cache via fault. But that is handled via low level + * hash fault code (hash_page_do_lazy_icache()) and we should not reach + * here in such case. + * + * For wrong access that can result in PROTFAULT, the above vma->vm_flags + * check should handle those and hence we should fall to the bad_area + * handling correctly. + * + * For embedded with per page exec support that doesn't support coherent + * icache we do get PROTFAULT and we handle that D/I cache sync in + * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON + * is conditional for server MMU. + * + * For radix, we can get prot fault for autonuma case, because radix + * page table will have them marked noaccess for user. + */ + if (!radix_enabled() && !is_write) + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +} +#else +static void sanity_check_fault(bool is_write, unsigned long error_code) { } +#endif /* CONFIG_PPC_STD_MMU */ + +/* + * Define the correct "is_write" bit in error_code based + * on the processor family + */ +#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#define page_fault_is_write(__err) ((__err) & ESR_DST) +#define page_fault_is_bad(__err) (0) +#else +#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) +#if defined(CONFIG_PPC_8xx) +#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) +#elif defined(CONFIG_PPC64) +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +#else +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) +#endif +#endif + /* * For 600- and 800-family processors, the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. For 400-family processors @@ -195,92 +375,56 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +static int __do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) { - enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - int code = SEGV_MAPERR; - int is_write = 0; - int trap = TRAP(regs); - int is_exec = trap == 0x400; + int is_exec = TRAP(regs) == 0x400; int is_user = user_mode(regs); - int fault; - int rc = 0, store_update_sp = 0; + int is_write = page_fault_is_write(error_code); + int fault, major = 0; + bool store_update_sp = false; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - /* - * Fortunately the bit assignments in SRR1 for an instruction - * fault and DSISR for a data fault are mostly the same for the - * bits we are interested in. But there are some bits which - * indicate errors in DSISR but can validly be set in SRR1. - */ - if (is_exec) - error_code &= 0x48200000; - else - is_write = error_code & DSISR_ISSTORE; -#else - is_write = error_code & ESR_DST; -#endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (notify_page_fault(regs)) + return 0; -#ifdef CONFIG_PPC_ICSWX - /* - * we need to do this early because this "data storage - * interrupt" does not update the DAR/DEAR so we don't want to - * look at it - */ - if (error_code & ICSWX_DSI_UCT) { - rc = acop_handle_fault(regs, address, error_code); - if (rc) - goto bail; + if (unlikely(page_fault_is_bad(error_code))) { + if (is_user) { + _exception(SIGBUS, regs, BUS_OBJERR, address); + return 0; + } + return SIGBUS; } -#endif /* CONFIG_PPC_ICSWX */ - - if (notify_page_fault(regs)) - goto bail; - if (unlikely(debugger_fault_handler(regs))) - goto bail; + /* Additional sanity check(s) */ + sanity_check_fault(is_write, error_code); /* * The kernel should never take an execute fault nor should it * take a page fault to a kernel address. */ - if (!is_user && (is_exec || (address >= TASK_SIZE))) { - rc = SIGSEGV; - goto bail; - } + if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + return SIGSEGV; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx)) - if (error_code & DSISR_DABRMATCH) { - /* breakpoint match */ - do_break(regs, address, error_code); - goto bail; + /* + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault + */ + if (unlikely(faulthandler_disabled() || !mm)) { + if (is_user) + printk_ratelimited(KERN_ERR "Page fault in user mode" + " with faulthandler_disabled()=%d" + " mm=%p\n", + faulthandler_disabled(), mm); + return bad_area_nosemaphore(regs, address); } -#endif /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (faulthandler_disabled() || mm == NULL) { - if (!is_user) { - rc = SIGSEGV; - goto bail; - } - /* faulthandler_disabled() in user mode is really bad, - as is current->mm == NULL. 
*/ - printk(KERN_EMERG "Page fault in user mode with " - "faulthandler_disabled() = %d mm = %p\n", - faulthandler_disabled(), mm); - printk(KERN_EMERG "NIP = %lx MSR = %lx\n", - regs->nip, regs->msr); - die("Weird page fault", regs, SIGSEGV); - } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -293,6 +437,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (is_user) flags |= FAULT_FLAG_USER; + if (is_write) + flags |= FAULT_FLAG_WRITE; + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -309,9 +457,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if (!is_user && !search_exception_tables(regs->nip)) - goto bad_area_nosemaphore; + return bad_area_nosemaphore(regs, address); retry: down_read(&mm->mmap_sem); @@ -325,122 +473,24 @@ retry: } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) + return bad_area(regs, address); + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return bad_area(regs, address); - /* - * N.B. The POWER/Open ABI allows programs to access up to - * 288 bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; + /* The stack is being expanded, check if it's valid */ + if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp))) + return bad_area(regs, address); - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. - */ - if (address + 2048 < uregs->gpr[1] && !store_update_sp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; + /* Try to expand it */ + if (unlikely(expand_stack(vma, address))) + return bad_area(regs, address); good_area: - code = SEGV_ACCERR; -#if defined(CONFIG_6xx) - if (error_code & 0x95700000) - /* an error such as lwarx to I/O controller space, - address matching DABR, eciwx, etc. */ - goto bad_area; -#endif /* CONFIG_6xx */ -#if defined(CONFIG_8xx) - /* The MPC8xx seems to always set 0x80000000, which is - * "undefined". Of those that can be set, this is the only - * one which seems bad. - */ - if (error_code & 0x10000000) - /* Guarded storage error. 
*/ - goto bad_area; -#endif /* CONFIG_8xx */ - - if (is_exec) { - /* - * Allow execution from readable areas if the MMU does not - * provide separate controls over reading and executing. - * - * Note: That code used to not be enabled for 4xx/BookE. - * It is now as I/D cache coherency for these is done at - * set_pte_at() time and I see no reason why the test - * below wouldn't be valid on those processors. This -may- - * break programs compiled with a really old ABI though. - */ - if (!(vma->vm_flags & VM_EXEC) && - (cpu_has_feature(CPU_FTR_NOEXECUTE) || - !(vma->vm_flags & (VM_READ | VM_WRITE)))) - goto bad_area; - /* a write */ - } else if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - flags |= FAULT_FLAG_WRITE; - /* a read */ - } else { - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } -#ifdef CONFIG_PPC_STD_MMU - /* - * For hash translation mode, we should never get a - * PROTFAULT. Any update to pte to reduce access will result in us - * removing the hash page table entry, thus resulting in a DSISR_NOHPTE - * fault instead of DSISR_PROTFAULT. - * - * A pte update to relax the access will not result in a hash page table - * entry invalidate and hence can result in DSISR_PROTFAULT. - * ptep_set_access_flags() doesn't do a hpte flush. This is why we have - * the special !is_write in the below conditional. - * - * For platforms that doesn't supports coherent icache and do support - * per page noexec bit, we do setup things such that we do the - * sync between D/I cache via fault. But that is handled via low level - * hash fault code (hash_page_do_lazy_icache()) and we should not reach - * here in such case. - * - * For wrong access that can result in PROTFAULT, the above vma->vm_flags - * check should handle those and hence we should fall to the bad_area - * handling correctly. - * - * For embedded with per page exec support that doesn't support coherent - * icache we do get PROTFAULT and we handle that D/I cache sync in - * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON - * is conditional for server MMU. - * - * For radix, we can get prot fault for autonuma case, because radix - * page table will have them marked noaccess for user. - */ - if (!radix_enabled() && !is_write) - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); -#endif /* CONFIG_PPC_STD_MMU */ + if (unlikely(access_error(is_write, is_exec, vma))) + return bad_area(regs, address); /* * If for any reason at all we couldn't handle the fault, @@ -448,6 +498,7 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* * Handle the retry right now, the mmap_sem has been released in that @@ -465,64 +516,39 @@ good_area: if (!fatal_signal_pending(current)) goto retry; } - /* We will enter mm_fault_error() below */ - } else - up_read(¤t->mm->mmap_sem); - - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (fault & VM_FAULT_SIGSEGV) - goto bad_area_nosemaphore; - rc = mm_fault_error(regs, address, fault); - if (rc >= MM_FAULT_RETURN) - goto bail; - else - rc = 0; + + /* + * User mode? Just return to handle the fatal exception otherwise + * return to bad_page_fault + */ + return is_user ? 0 : SIGBUS; } + up_read(¤t->mm->mmap_sem); + + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + /* * Major/minor page fault accounting. 
*/ - if (fault & VM_FAULT_MAJOR) { + if (major) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); -#ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - u32 page_ins; - - preempt_disable(); - page_ins = be32_to_cpu(get_lppaca()->page_ins); - page_ins += 1 << PAGE_FACTOR; - get_lppaca()->page_ins = cpu_to_be32(page_ins); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + cmo_account_page_fault(); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - - goto bail; - -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (is_user) { - _exception(SIGSEGV, regs, code, address); - goto bail; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } + return 0; +} +NOKPROBE_SYMBOL(__do_page_fault); - if (is_exec && (error_code & DSISR_PROTFAULT)) - printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, current_uid())); - - rc = SIGSEGV; - -bail: +int do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + enum ctx_state prev_state = exception_enter(); + int rc = __do_page_fault(regs, address, error_code); exception_exit(prev_state); return rc; } diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 6f962e5cb5e1..ffbd7c0bda96 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -575,7 +575,6 @@ _GLOBAL(flush_hash_pages) rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ stwcx. r8,0,r5 /* update the pte */ bne- 33b -EXPORT_SYMBOL(flush_hash_pages) /* Get the address of the primary PTE group in the hash table (r3) */ _GLOBAL(flush_hash_patch_A) @@ -634,6 +633,7 @@ _GLOBAL(flush_hash_patch_B) SYNC_601 isync blr +EXPORT_SYMBOL(flush_hash_pages) /* * Flush an entry from the TLB diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a20669c19e7..67ec2e927253 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -61,6 +61,7 @@ #include <asm/tm.h> #include <asm/trace.h> #include <asm/ps3.h> +#include <asm/pte-walk.h> #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -507,9 +508,9 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, printk(KERN_INFO "Huge page(16GB) memory: " "addr = 0x%lX size = 0x%lX pages = %d\n", phys_addr, block_size, expected_pages); - if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { memblock_reserve(phys_addr, block_size * expected_pages); - add_gpage(phys_addr, block_size, expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); } return 0; } @@ -1019,6 +1020,7 @@ void __init hash__early_init_mmu(void) __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -1228,7 +1230,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long vsid; pte_t *ptep; unsigned hugeshift; - const struct cpumask *tmp; int rc, user_region = 0; int psize, ssize; @@ -1280,8 +1281,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } /* Check CPU locality */ - tmp = cpumask_of(smp_processor_id()); - if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) + if (user_region && mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; #ifndef CONFIG_PPC_64K_PAGES @@ -1297,7 +1297,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1526,7 +1526,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); if (!ptep) goto out_exit; @@ -1543,7 +1543,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Is that local to this CPU ? */ - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + if (mm_is_thread_local(mm)) update_flags |= HPTE_LOCAL_UPDATE; /* Hash it in */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e1bf5ca397fe..1571a498a33f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,8 @@ #include <asm/tlb.h> #include <asm/setup.h> #include <asm/hugetlb.h> +#include <asm/pte-walk.h> + #ifdef CONFIG_HUGETLB_PAGE @@ -36,32 +38,15 @@ unsigned int HPAGE_SHIFT; EXPORT_SYMBOL(HPAGE_SHIFT); -/* - * Tracks gpages after the device tree is scanned and before the - * huge_boot_pages list is ready. On non-Freescale implementations, this is - * just used to track 16G pages and so is a single array. 
FSL-based - * implementations may have more than one gpage size, so we need multiple - * arrays - */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -#define MAX_NUMBER_GPAGES 128 -struct psize_gpages { - u64 gpage_list[MAX_NUMBER_GPAGES]; - unsigned int nr_gpages; -}; -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; -#else -#define MAX_NUMBER_GPAGES 1024 -static u64 gpage_freearray[MAX_NUMBER_GPAGES]; -static unsigned nr_gpages; -#endif - #define hugepd_none(hpd) (hpd_val(hpd) == 0) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - /* Only called for hugetlbfs pages, hence can ignore THP */ - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); + /* + * Only called for hugetlbfs pages, hence can ignore THP and the + * irq disabled walk. + */ + return __find_linux_pte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -210,145 +195,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(*hpdp, addr, pdshift); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy allocator is setup. - */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) -{ - unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); - int i; - - if (addr == 0) - return; - - gpage_freearray[idx].nr_gpages = number_of_pages; - - for (i = 0; i < number_of_pages; i++) { - gpage_freearray[idx].gpage_list[i] = addr; - addr += page_size; - } -} - -/* - * Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) -{ - struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(huge_page_shift(hstate)); - int nr_gpages = gpage_freearray[idx].nr_gpages; - - if (nr_gpages == 0) - return 0; - -#ifdef CONFIG_HIGHMEM - /* - * If gpages can be in highmem we can't use the trick of storing the - * data structure in the page; allocate space for this - */ - m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); - m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; -#else - m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); -#endif - - list_add(&m->list, &huge_boot_pages); - gpage_freearray[idx].nr_gpages = nr_gpages; - gpage_freearray[idx].gpage_list[nr_gpages] = 0; - m->hstate = hstate; - - return 1; -} +#ifdef CONFIG_PPC_BOOK3S_64 /* - * Scan the command line hugepagesz= options for gigantic pages; store those in - * a list that we use to allocate the memory once all options are parsed. + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready on pseries. */ - -unsigned long gpage_npages[MMU_PAGE_COUNT]; - -static int __init do_gpage_early_setup(char *param, char *val, - const char *unused, void *arg) -{ - static phys_addr_t size; - unsigned long npages; - - /* - * The hugepagesz and hugepages cmdline options are interleaved. We - * use the size variable to keep track of whether or not this was done - * properly and skip over instances where it is incorrect. Other - * command-line parsing code will issue warnings, so we don't need to. 
- * - */ - if ((strcmp(param, "default_hugepagesz") == 0) || - (strcmp(param, "hugepagesz") == 0)) { - size = memparse(val, NULL); - } else if (strcmp(param, "hugepages") == 0) { - if (size != 0) { - if (sscanf(val, "%lu", &npages) <= 0) - npages = 0; - if (npages > MAX_NUMBER_GPAGES) { - pr_warn("MMU: %lu pages requested for page " -#ifdef CONFIG_PHYS_ADDR_T_64BIT - "size %llu KB, limiting to " -#else - "size %u KB, limiting to " -#endif - __stringify(MAX_NUMBER_GPAGES) "\n", - npages, size / 1024); - npages = MAX_NUMBER_GPAGES; - } - gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; - size = 0; - } - } - return 0; -} - +#define MAX_NUMBER_GPAGES 1024 +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +__initdata static unsigned nr_gpages; /* - * This function allocates physical space for pages that are larger than the - * buddy allocator can handle. We want to allocate these in highmem because - * the amount of lowmem is limited. This means that this function MUST be - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb - * allocate to grab highmem. - */ -void __init reserve_hugetlb_gpages(void) -{ - static __initdata char cmdline[COMMAND_LINE_SIZE]; - phys_addr_t size, base; - int i; - - strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - NULL, &do_gpage_early_setup); - - /* - * Walk gpage list in reverse, allocating larger page sizes first. - * Skip over unsupported sizes, or sizes that have 0 gpages allocated. - * When we reach the point in the list where pages are no longer - * considered gpages, we're done. - */ - for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { - if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) - continue; - else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) - break; - - size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); - base = memblock_alloc_base(size * gpage_npages[i], size, - MEMBLOCK_ALLOC_ANYWHERE); - add_gpage(base, size, gpage_npages[i]); - } -} - -#else /* !PPC_FSL_BOOK3E */ - -/* Build list of addresses of gigantic pages. This function is used in early + * Build list of addresses of gigantic pages. This function is used in early * boot before the buddy allocator is setup. */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -360,10 +220,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) } } -/* Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. 
- */ -int alloc_bootmem_huge_page(struct hstate *hstate) +int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; if (nr_gpages == 0) @@ -376,6 +233,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif + +int __init alloc_bootmem_huge_page(struct hstate *h) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); +#endif + return __alloc_bootmem_huge_page(h); +} + #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -407,8 +275,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte) batchp = &get_cpu_var(hugepd_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), - cpumask_of(smp_processor_id()))) { + mm_is_thread_local(tlb->mm)) { kmem_cache_free(hugepte_cache, hugepte); put_cpu_var(hugepd_freelist_cur); return; @@ -886,9 +753,8 @@ void flush_dcache_icache_hugepage(struct page *page) * This function need to be called with interrupts disabled. We use this variant * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ - -pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -897,8 +763,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; - if (shift) - *shift = 0; + if (hpage_shift) + *hpage_shift = 0; if (is_thp) *is_thp = false; @@ -968,16 +834,15 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, ret_pte = hugepte_offset(*hpdp, ea, pdshift); pdshift = hugepd_shift(*hpdp); out: - if (shift) - *shift = pdshift; + if (hpage_shift) + *hpage_shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; unsigned long pte_end; struct page *head, *page; pte_t pte; @@ -988,18 +853,10 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_READ; - /* - * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined - * as 0 and _PAGE_RO has to be set when a page is not writable - */ - if (write) - mask |= _PAGE_WRITE; - else - mask |= _PAGE_RO; - - if ((pte_val(pte) & mask) != mask) + if (!pte_present(pte) || !pte_read(pte)) + return 0; + if (write && !pte_write(pte)) return 0; /* hugepages are never "special" */ diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c deleted file mode 100644 index 1fa794d7d59f..000000000000 --- a/arch/powerpc/mm/icswx.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/uaccess.h> - -#include "icswx.h" - -/* - * The processor and its L2 cache cause the icswx instruction to - * generate a COP_REQ transaction on PowerBus. The transaction has no - * address, and the processor does not perform an MMU access to - * authenticate the transaction. The command portion of the PowerBus - * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor - * Process ID (PID), which the coprocessor compares to the authorized - * LPID and PID held in the coprocessor, to determine if the process - * is authorized to generate the transaction. The data of the COP_REQ - * transaction is 128-byte or less in size and is placed in cacheable - * memory on a 128-byte cache line boundary. - * - * The task to use a coprocessor should use use_cop() to mark the use - * of the Coprocessor Type (CT) and context switching. On a server - * class processor, the PID register is used only for coprocessor - * management + * and so a coprocessor PID is allocated before - * executing icswx + * instruction. Drop_cop() is used to free the - * coprocessor PID. - * - * Example: - * Host Fabric Interface (HFI) is a PowerPC network coprocessor. - * Each HFI have multiple windows. Each HFI window serves as a - * network device sending to and receiving from HFI network. - * HFI immediate send function uses icswx instruction. The immediate - * send function allows small (single cache-line) packets be sent - * without using the regular HFI send FIFO and doorbell, which are - * much slower than immediate send. - * - * For each task intending to use HFI immediate send, the HFI driver - * calls use_cop() to obtain a coprocessor PID for the task. - * The HFI driver then allocate a free HFI window and save the - * coprocessor PID to the HFI window to allow the task to use the - * HFI window. - * - * The HFI driver repeatedly creates immediate send packets and - * issues icswx instruction to send data through the HFI window. - * The HFI compares the coprocessor PID in the CPU PID register - * to the PID held in the HFI window to determine if the transaction - * is allowed. - * - * When the task to release the HFI window, the HFI driver calls - * drop_cop() to release the coprocessor PID. - */ - -void switch_cop(struct mm_struct *next) -{ -#ifdef CONFIG_PPC_ICSWX_PID - mtspr(SPRN_PID, next->context.cop_pid); -#endif - mtspr(SPRN_ACOP, next->context.acop); -} - -/** - * Start using a coprocessor. - * @acop: mask of coprocessor to be used. - * @mm: The mm the coprocessor to associate with. Most likely current mm. - * - * Return a positive PID if successful. Negative errno otherwise. - * The returned PID will be fed to the coprocessor to determine if an - * icswx transaction is authenticated. - */ -int use_cop(unsigned long acop, struct mm_struct *mm) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return -ENODEV; - - if (!mm || !acop) - return -EINVAL; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - ret = get_cop_pid(mm); - if (ret < 0) - goto out; - - /* update acop */ - mm->context.acop |= acop; - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. 
- */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - -out: - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(use_cop); - -/** - * Stop using a coprocessor. - * @acop: mask of coprocessor to be stopped. - * @mm: The mm the coprocessor associated with. - */ -void drop_cop(unsigned long acop, struct mm_struct *mm) -{ - int free_pid; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return; - - if (WARN_ON_ONCE(!mm)) - return; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - mm->context.acop &= ~acop; - - free_pid = disable_cop_pid(mm); - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - if (free_pid != COP_PID_NONE) - free_cop_pid(free_pid); - - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); -} -EXPORT_SYMBOL_GPL(drop_cop); - -static int acop_use_cop(int ct) -{ - /* There is no alternate policy, yet */ - return -1; -} - -/* - * Get the instruction word at the NIP - */ -static u32 acop_get_inst(struct pt_regs *regs) -{ - u32 inst; - u32 __user *p; - - p = (u32 __user *)regs->nip; - if (!access_ok(VERIFY_READ, p, sizeof(*p))) - return 0; - - if (__get_user(inst, p)) - return 0; - - return inst; -} - -/** - * @regs: registers at time of interrupt - * @address: storage address - * @error_code: Fault code, usually the DSISR or ESR depending on - * processor type - * - * Return 0 if we are able to resolve the data storage fault that - * results from a CT miss in the ACOP register. - */ -int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - int ct; - u32 inst = 0; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) { - pr_info("No coprocessors available"); - _exception(SIGILL, regs, ILL_ILLOPN, address); - } - - if (!user_mode(regs)) { - /* this could happen if the HV denies the - * kernel access, for now we just die */ - die("ICSWX from kernel failed", regs, SIGSEGV); - } - - /* Some implementations leave us a hint for the CT */ - ct = ICSWX_GET_CT_HINT(error_code); - if (ct < 0) { - /* we have to peek at the instruction word to figure out CT */ - u32 ccw; - u32 rs; - - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - - rs = (inst >> (31 - 10)) & 0x1f; - ccw = regs->gpr[rs]; - ct = (ccw >> 16) & 0x3f; - } - - /* - * We could be here because another thread has enabled acop - * but the ACOP register has yet to be updated. - * - * This should have been taken care of by the IPI to sync all - * the threads (see smp_call_function(sync_cop, mm, 1)), but - * that could take forever if there are a significant amount - * of threads. - * - * Given the number of threads on some of these systems, - * perhaps this is the best way to sync ACOP rather than whack - * every thread with an IPI. 
- */ - if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { - sync_cop(current->active_mm); - return 0; - } - - /* check for alternate policy */ - if (!acop_use_cop(ct)) - return 0; - - /* at this point the CT is unknown to the system */ - pr_warn("%s[%d]: Coprocessor %d is unavailable\n", - current->comm, current->pid, ct); - - /* get inst if we don't already have it */ - if (inst == 0) { - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - } - - /* Check if the instruction is the "record form" */ - if (inst & 1) { - /* - * the instruction is "record" form so we can reject - * using CR0 - */ - regs->ccr &= ~(0xful << 28); - regs->ccr |= ICSWX_RC_NOT_FOUND << 28; - - /* Move on to the next instruction */ - regs->nip += 4; - } else { - /* - * There is no architected mechanism to report a bad - * CT so we could either SIGILL or report nothing. - * Since the non-record version should only bu used - * for "hints" or "don't care" we should probably do - * nothing. However, I could see how some people - * might want an SIGILL so it here if you want it. - */ -#ifdef CONFIG_PPC_ICSWX_USE_SIGILL - _exception(SIGILL, regs, ILL_ILLOPN, address); -#else - regs->nip += 4; -#endif - } - - return 0; -} -EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h deleted file mode 100644 index 6dedc08e62c8..000000000000 --- a/arch/powerpc/mm/icswx.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _ARCH_POWERPC_MM_ICSWX_H_ -#define _ARCH_POWERPC_MM_ICSWX_H_ - -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <asm/mmu_context.h> - -/* also used to denote that PIDs are not used */ -#define COP_PID_NONE 0 - -static inline void sync_cop(void *arg) -{ - struct mm_struct *mm = arg; - - if (mm == current->active_mm) - switch_cop(current->active_mm); -} - -#ifdef CONFIG_PPC_ICSWX_PID -extern int get_cop_pid(struct mm_struct *mm); -extern int disable_cop_pid(struct mm_struct *mm); -extern void free_cop_pid(int free_pid); -#else -#define get_cop_pid(m) (COP_PID_NONE) -#define disable_cop_pid(m) (COP_PID_NONE) -#define free_cop_pid(p) -#endif - -/* - * These are implementation bits for architected registers. If this - * ever becomes architecture the should be moved to reg.h et. al. 
- */ -/* UCT is the same bit for Server and Embedded */ -#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ - -#ifdef CONFIG_PPC_BOOK3E -/* Embedded implementation gives us no hints as to what the CT is */ -#define ICSWX_GET_CT_HINT(x) (-1) -#else -/* Server implementation contains the CT value in the DSISR */ -#define ICSWX_DSISR_CTMASK 0x00003f00 -#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) -#endif - -#define ICSWX_RC_STARTED 0x8 /* The request has been started */ -#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ -#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ -#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ - -extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code); - -static inline u64 acop_copro_type_bit(unsigned int type) -{ - return 1ULL << (63 - type); -} - -#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c deleted file mode 100644 index 91e30eb7d054..000000000000 --- a/arch/powerpc/mm/icswx_pid.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * ICSWX and ACOP/PID Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> -#include <linux/module.h> -#include "icswx.h" - -#define COP_PID_MIN (COP_PID_NONE + 1) -#define COP_PID_MAX (0xFFFF) - -static DEFINE_SPINLOCK(mmu_context_acop_lock); -static DEFINE_IDA(cop_ida); - -static int new_cop_pid(struct ida *ida, int min_id, int max_id, - spinlock_t *lock) -{ - int index; - int err; - -again: - if (!ida_pre_get(ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(lock); - err = ida_get_new_above(ida, min_id, &index); - spin_unlock(lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(lock); - ida_remove(ida, index); - spin_unlock(lock); - return -ENOMEM; - } - - return index; -} - -int get_cop_pid(struct mm_struct *mm) -{ - int pid; - - if (mm->context.cop_pid == COP_PID_NONE) { - pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, - &mmu_context_acop_lock); - if (pid >= 0) - mm->context.cop_pid = pid; - } - return mm->context.cop_pid; -} - -int disable_cop_pid(struct mm_struct *mm) -{ - int free_pid = COP_PID_NONE; - - if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { - free_pid = mm->context.cop_pid; - mm->context.cop_pid = COP_PID_NONE; - } - return free_pid; -} - -void free_cop_pid(int free_pid) -{ - spin_lock(&mmu_context_acop_lock); - ida_remove(&cop_ida, free_pid); - spin_unlock(&mmu_context_acop_lock); -} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 8a7c38b8d335..6419b33ca309 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -113,6 +113,12 @@ void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } +#ifdef CONFIG_STRICT_KERNEL_RWX + if (rodata_enabled) { + __map_without_bats = 1; + __map_without_ltlbs = 1; + } +#endif } /* @@ -132,8 +138,6 @@ void __init MMU_init(void) * Reserve gigantic pages for hugetlb. 
This MUST occur before * lowmem_end_addr is initialized below. */ - reserve_hugetlb_gpages(); - if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock_enforce_memory_limit(memblock.memory.regions[0].size); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 5b4c25d12ff3..588a521966ec 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -356,7 +356,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#elif defined(CONFIG_FLATMEM) +#else struct page *realmode_pfn_to_page(unsigned long pfn) { @@ -365,7 +365,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_STD_MMU_64 static bool disable_radix; @@ -381,7 +381,7 @@ early_param("disable_radix", parse_disable_radix); * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do * radix. If not, we clear the radix feature bit so we fall back to hash. */ -static void early_check_vec5(void) +static void __init early_check_vec5(void) { unsigned long root, chosen; int size; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 46b4e67d2372..4362b86ef84c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -436,7 +436,7 @@ void flush_dcache_icache_page(struct page *page) return; } #endif -#if defined(CONFIG_8xx) || defined(CONFIG_PPC64) +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c new file mode 100644 index 000000000000..0f613bc63c50 --- /dev/null +++ b/arch/powerpc/mm/mmu_context.c @@ -0,0 +1,99 @@ +/* + * Common implementation of switch_mm_irqs_off + * + * Copyright IBM Corp. 2017 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/cpu.h> + +#include <asm/mmu_context.h> + +#if defined(CONFIG_PPC32) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 32-bit keeps track of the current PGDIR in the thread struct */ + tsk->thread.pgdir = mm->pgd; +} +#elif defined(CONFIG_PPC_BOOK3E_64) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 64-bit Book3E keeps track of current PGD in the PACA */ + get_paca()->pgd = mm->pgd; +} +#else +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) { } +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +#endif + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + bool new_on_cpu = false; + + /* Mark this context has been used on the new CPU */ + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + inc_mm_active_cpus(next); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. 
+ * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. + */ + smp_mb(); + + new_on_cpu = true; + } + + /* Some subarchs need to track the PGD elsewhere */ + switch_mm_pgdir(tsk, next); + + /* Nothing else to do if we aren't actually switching */ + if (prev == next) + return; + + /* + * We must stop all altivec streams before changing the HW + * context + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile ("dssall"); + + if (new_on_cpu) + radix_kvm_prefetch_workaround(next); + + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now + */ + switch_mmu_context(prev, next, tsk); +} + diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index a75f63833284..05e15386d4cb 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -25,8 +25,6 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> -#include "icswx.h" - static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); @@ -165,16 +163,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) return index; mm->context.id = index; -#ifdef CONFIG_PPC_ICSWX - mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!mm->context.cop_lockp) { - __destroy_context(index); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; - return -ENOMEM; - } - spin_lock_init(mm->context.cop_lockp); -#endif /* CONFIG_PPC_ICSWX */ #ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; @@ -182,6 +170,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(mm); #endif + atomic_set(&mm->context.active_cpus, 0); + return 0; } @@ -226,12 +216,6 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif -#ifdef CONFIG_PPC_ICSWX - drop_cop(mm->context.acop, mm); - kfree(mm->context.cop_lockp); - mm->context.cop_lockp = NULL; -#endif /* CONFIG_PPC_ICSWX */ - if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d46128b22150..57fbc554c785 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -27,7 +27,7 @@ /* * On 40x and 8xx, we directly inline tlbia and tlbivax */ -#if defined(CONFIG_40x) || defined(CONFIG_8xx) +#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx) static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) -#else /* CONFIG_40x || CONFIG_8xx */ +#else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); #ifdef CONFIG_PPC_BOOK3E @@ -46,12 +46,12 @@ extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif -#endif /* !(CONFIG_40x || CONFIG_8xx) */ +#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx static inline void 
_tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { __tlbil_va(address, pid); } -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 31eed8fa8e99..3b65917785a5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <misc/cxl-base.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -64,6 +65,27 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. + */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); +} + /* * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. @@ -77,7 +99,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 443a2c66a304..ec277913e01b 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * by sending an IPI to all the cpus and executing a dummy * function there. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); /* * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a @@ -329,7 +329,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, unsigned int psize; unsigned long vsid; unsigned long flags = 0; - const struct cpumask *tmp; /* get the base page size,vsid and segment size */ #ifdef CONFIG_DEBUG_VM @@ -350,8 +349,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, ssize = mmu_kernel_ssize; } - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) + if (mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); @@ -380,16 +378,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, */ memset(pgtable, 0, PTE_FRAG_SIZE); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte variants which do lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. 
Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - find_linux_pte_or_hugepage to finish. + find_current_mm_pte to finish. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 671a45d86c18..39c252b54d16 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -8,10 +8,15 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#define pr_fmt(fmt) "radix-mmu: " fmt + +#include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/mm.h> +#include <linux/string_helpers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -31,9 +36,13 @@ unsigned int mmu_base_pid; static int native_register_process_table(unsigned long base, unsigned long pg_sz, unsigned long table_size) { - unsigned long patb1 = base | table_size | PATB_GR; + unsigned long patb0, patb1; + + patb0 = be64_to_cpu(partition_tb[0].patb0); + patb1 = base | table_size | PATB_GR; + + mmu_partition_table_set_entry(0, patb0, patb1); - partition_tb->patb1 = cpu_to_be64(patb1); return 0; } @@ -179,10 +188,14 @@ static inline void __meminit print_mapping(unsigned long start, unsigned long end, unsigned long size) { + char buf[10]; + if (end <= start) return; - pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size); + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); } static int __meminit create_physical_mapping(unsigned long start, @@ -526,6 +539,7 @@ void __init radix__early_init_mmu(void) __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -836,9 +850,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre */ pmd = *pmdp; pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ - kick_all_cpus_sync(); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + serialize_against_pte_lookup(vma->vm_mm); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + return pmd; } @@ -897,16 +914,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - find_linux_pte_or_hugepage to finish. + find_current_mm_pte to finish. 
*/ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a9e4bfc025bc..65eda1997c3f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -34,6 +34,7 @@ #include <asm/fixmap.h> #include <asm/io.h> #include <asm/setup.h> +#include <asm/sections.h> #include "mmu_decl.h" @@ -242,7 +243,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) /* * Map in a chunk of physical memory starting at start. */ -void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) +static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { unsigned long v, s, f; phys_addr_t p; @@ -294,7 +295,7 @@ void __init mapin_ram(void) * Returns true (1) if PTE was found, zero otherwise. The pointer to * the PTE pointer is unmodified if PTE is not found. */ -int +static int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) { pgd_t *pgd; @@ -323,9 +324,7 @@ get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) return(retval); } -#ifdef CONFIG_DEBUG_PAGEALLOC - -static int __change_page_attr(struct page *page, pgprot_t prot) +static int __change_page_attr_noflush(struct page *page, pgprot_t prot) { pte_t *kpte; pmd_t *kpmd; @@ -339,8 +338,6 @@ static int __change_page_attr(struct page *page, pgprot_t prot) if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) return -EINVAL; __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); - wmb(); - flush_tlb_page(NULL, address); pte_unmap(kpte); return 0; @@ -349,44 +346,65 @@ static int __change_page_attr(struct page *page, pgprot_t prot) /* * Change the page attributes of an page in the linear mapping. * - * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY + * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY */ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int i, err = 0; unsigned long flags; + struct page *start = page; local_irq_save(flags); for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); + err = __change_page_attr_noflush(page, prot); if (err) break; } + wmb(); + flush_tlb_kernel_range((unsigned long)page_address(start), + (unsigned long)page_address(page)); local_irq_restore(flags); return err; } - -void __kernel_map_pages(struct page *page, int numpages, int enable) +void mark_initmem_nx(void) { - if (PageHighMem(page)) - return; + struct page *page = virt_to_page(_sinittext); + unsigned long numpages = PFN_UP((unsigned long)_einittext) - + PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + change_page_attr(page, numpages, PAGE_KERNEL); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ -static int fixmaps; - -void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_rodata_ro(void) { - unsigned long address = __fix_to_virt(idx); + struct page *page; + unsigned long numpages; + + page = virt_to_page(_stext); + numpages = PFN_UP((unsigned long)_etext) - + PFN_DOWN((unsigned long)_stext); - if (idx >= __end_of_fixed_addresses) { - BUG(); + change_page_attr(page, numpages, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. 
+ */ + page = virt_to_page(__start_rodata); + numpages = PFN_UP((unsigned long)__init_begin) - + PFN_DOWN((unsigned long)__start_rodata); + + change_page_attr(page, numpages, PAGE_KERNEL_RO); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) return; - } - map_kernel_page(address, phys, pgprot_val(flags)); - fixmaps++; + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0736e94c7615..ac0717a90ca6 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -104,6 +104,8 @@ unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index bde378559d01..906a86fe457b 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -121,12 +121,25 @@ slb_miss_kernel_load_vmemmap: 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - /* vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines + /* + * r10 contains the ESID, which is the original faulting EA shifted + * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) + * which is 0xd00038000. That can't be used as an immediate, even if we + * ignored the 0xd, so we have to load it into a register, and we only + * have one register free. So we must load all of (H_VMALLOC_END >> 28) + * into a register and compare ESID against that. + */ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 + // Rotate left 4, then mask with 0xffffffff0 + rldic r11,r11,4,28 // r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + + /* + * vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines. 
*/ - clrldi r11,r10,48 - cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 - bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 16ae1bbe13f0..b3e849c4886e 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -54,23 +54,15 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) */ __tlbiel_pid(pid, 0, ric); - if (ric == RIC_FLUSH_ALL) - /* For the remaining sets, just flush the TLB */ - ric = RIC_FLUSH_TLB; + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + /* For the remaining sets, just flush the TLB */ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, ric); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void tlbiel_pwc(unsigned long pid) -{ - asm volatile("ptesync": : :"memory"); - - /* For PWC flush, we don't look at set number */ - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); @@ -146,31 +138,23 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); -void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +#ifndef CONFIG_SMP +static void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. - */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - tlbiel_pwc(pid); - + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(radix__local_flush_tlb_pwc); +#endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) @@ -208,38 +192,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); + _tlbie_pid(pid, RIC_FLUSH_TLB); else - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +static void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. 
- */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_PWC); + _tlbie_pid(pid, RIC_FLUSH_ALL); else - tlbiel_pwc(pid); + _tlbiel_pid(pid, RIC_FLUSH_ALL); no_context: preempt_enable(); } + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} EXPORT_SYMBOL(radix__flush_tlb_pwc); void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -271,6 +252,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(radix__flush_tlb_page); +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -288,6 +271,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -319,7 +303,10 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else + else if (tlb->need_flush_all) { + tlb->need_flush_all = 0; + radix__flush_all_mm(mm); + } else radix__flush_tlb_mm(mm); } @@ -364,6 +351,43 @@ err_out: preempt_enable(); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + int local = mm_is_thread_local(mm); + unsigned long ap = mmu_get_ap(mmu_virtual_psize); + unsigned long pid, end; + + + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + /* Otherwise first do the PWC */ + if (local) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + else + _tlbie_pid(pid, RIC_FLUSH_PWC); + + /* Then iterate the pages */ + end = addr + HPAGE_PMD_SIZE; + for (; addr < end; addr += PAGE_SIZE) { + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + } +no_context: + preempt_enable(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size) { diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index b5b0fb97b9c0..881ebd53ffc2 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -29,6 +29,8 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/pte-walk.h> + #include <trace/events/thp.h> @@ -138,13 +140,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) { - const struct cpumask *tmp; - int i, local = 0; + int i, local; i = batch->index; - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(batch->mm), tmp)) - local = 1; + local = mm_is_thread_local(batch->mm); if (i == 1) flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); @@ -207,8 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); unsigned long 
pte; if (ptep == NULL) diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index eabecfcaef7c..048b8e9f4492 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -60,7 +60,7 @@ _GLOBAL(__tlbil_va) isync 1: blr -#elif defined(CONFIG_8xx) +#elif defined(CONFIG_PPC_8xx) /* * Nothing to do for 8xx, everything is inline |
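
A recurring change in the hunks above is the replacement of kick_all_cpus_sync() with the serialize_against_pte_lookup() helper added in pgtable-book3s64.c, which issues a no-op IPI only to the CPUs in mm_cpumask(mm) instead of every online CPU. The sketch below is a minimal illustration of the calling pattern used by the pmdp collapse/clear paths, assuming the helper and kernel types shown in the diff; clear_pmd_and_serialize() itself is a hypothetical wrapper for illustration, not a function from this series.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * Sketch of the pattern followed by radix__pmdp_collapse_flush() and the
 * *_pmdp_huge_get_and_clear() paths above: clear the pmd, then wait for
 * any concurrent lock-less walker before the entry can be repopulated
 * with a page-table pointer. Hypothetical wrapper, illustration only.
 */
static pmd_t clear_pmd_and_serialize(struct mm_struct *mm, pmd_t *pmdp)
{
	pmd_t old_pmd = *pmdp;

	pmd_clear(pmdp);

	/*
	 * Per the pgtable-book3s64.c hunk: smp_mb() plus an empty
	 * smp_call_function_many() over mm_cpumask(mm), so this returns
	 * only after every CPU running this mm has left any IRQ-disabled
	 * find_current_mm_pte() walk.
	 */
	serialize_against_pte_lookup(mm);

	return old_pmd;
}

The helper only interrupts the CPUs recorded in mm_cpumask(); switch_mm_irqs_off() in the new mmu_context.c marks that mask with a full barrier that pairs with the barrier in pte_xchg() on the lookup side, which is what keeps this narrower IPI sufficient.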