summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/8xx_mmu.c29
-rw-r--r--arch/powerpc/mm/Makefile4
-rw-r--r--arch/powerpc/mm/dump_hashpagetable.c2
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables.c2
-rw-r--r--arch/powerpc/mm/fault.c562
-rw-r--r--arch/powerpc/mm/hash_low_32.S2
-rw-r--r--arch/powerpc/mm/hash_utils_64.c16
-rw-r--r--arch/powerpc/mm/hugetlbpage.c219
-rw-r--r--arch/powerpc/mm/icswx.c292
-rw-r--r--arch/powerpc/mm/icswx.h68
-rw-r--r--arch/powerpc/mm/icswx_pid.c87
-rw-r--r--arch/powerpc/mm/init_32.c8
-rw-r--r--arch/powerpc/mm/init_64.c6
-rw-r--r--arch/powerpc/mm/mem.c2
-rw-r--r--arch/powerpc/mm/mmu_context.c99
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c20
-rw-r--r--arch/powerpc/mm/mmu_decl.h10
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c24
-rw-r--r--arch/powerpc/mm/pgtable-hash64.c12
-rw-r--r--arch/powerpc/mm/pgtable-radix.c33
-rw-r--r--arch/powerpc/mm/pgtable_32.c66
-rw-r--r--arch/powerpc/mm/pgtable_64.c2
-rw-r--r--arch/powerpc/mm/slb_low.S23
-rw-r--r--arch/powerpc/mm/tlb-radix.c108
-rw-r--r--arch/powerpc/mm/tlb_hash64.c13
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S2
26 files changed, 670 insertions, 1041 deletions
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index f4c6472f2fc4..f29212e40f40 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -22,8 +22,11 @@
extern int __map_without_ltlbs;
+static unsigned long block_mapped_ram;
+
/*
- * Return PA for this VA if it is in IMMR area, or 0
+ * Return PA for this VA if it is in an area mapped with LTLBs.
+ * Otherwise, returns 0
*/
phys_addr_t v_block_mapped(unsigned long va)
{
@@ -33,11 +36,13 @@ phys_addr_t v_block_mapped(unsigned long va)
return 0;
if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
return p + va - VIRT_IMMR_BASE;
+ if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
+ return __pa(va);
return 0;
}
/*
- * Return VA for a given PA or 0 if not mapped
+ * Return VA for a given PA mapped with LTLBs or 0 if not mapped
*/
unsigned long p_block_mapped(phys_addr_t pa)
{
@@ -47,6 +52,8 @@ unsigned long p_block_mapped(phys_addr_t pa)
return 0;
if (pa >= p && pa < p + IMMR_SIZE)
return VIRT_IMMR_BASE + pa - p;
+ if (pa < block_mapped_ram)
+ return (unsigned long)__va(pa);
return 0;
}
@@ -58,7 +65,7 @@ unsigned long p_block_mapped(phys_addr_t pa)
void __init MMU_init_hw(void)
{
/* PIN up to the 3 first 8Mb after IMMR in DTLB table */
-#ifdef CONFIG_PIN_TLB
+#ifdef CONFIG_PIN_TLB_DATA
unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY;
#ifdef CONFIG_PIN_TLB_IMMR
@@ -80,7 +87,7 @@ void __init MMU_init_hw(void)
#endif
}
-static void mmu_mapin_immr(void)
+static void __init mmu_mapin_immr(void)
{
unsigned long p = PHYS_IMMR_BASE;
unsigned long v = VIRT_IMMR_BASE;
@@ -96,8 +103,11 @@ static void mmu_mapin_immr(void)
extern unsigned int DTLBMiss_jmp;
#endif
extern unsigned int DTLBMiss_cmp, FixupDAR_cmp;
+#ifndef CONFIG_PIN_TLB_TEXT
+extern unsigned int ITLBMiss_cmp;
+#endif
-void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
+static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
{
unsigned int instr = *addr;
@@ -116,6 +126,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
#ifndef CONFIG_PIN_TLB_IMMR
patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP);
#endif
+#ifndef CONFIG_PIN_TLB_TEXT
+ mmu_patch_cmp_limit(&ITLBMiss_cmp, 0);
+#endif
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
}
@@ -133,11 +146,13 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
if (mapped)
memblock_set_current_limit(mapped);
+ block_mapped_ram = mapped;
+
return mapped;
}
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
+void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
{
/* We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 7414034df1c3..fb844d2f266e 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -8,7 +8,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(BITS).o pgtable_$(BITS).o \
- init-common.o
+ init-common.o mmu_context.o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
@@ -22,8 +22,6 @@ ifeq ($(CONFIG_PPC_STD_MMU_64),y)
obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o
obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o
endif
-obj-$(CONFIG_PPC_ICSWX) += icswx.o
-obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o
obj-$(CONFIG_40x) += 40x_mmu.o
obj-$(CONFIG_44x) += 44x_mmu.o
obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index b1c144b03fcf..5c4c93dcff19 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -205,7 +205,7 @@ static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r,
aps_index = calculate_pagesize(st, aps, "actual");
if (aps_index != 2)
seq_printf(st->seq, "LP enc: %lx", lp);
- seq_puts(st->seq, "\n");
+ seq_putc(st->seq, '\n');
}
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 44fe4833910f..c9282d27b203 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -350,7 +350,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
st->current_flags,
pg_level[st->level].num);
- seq_puts(st->seq, "\n");
+ seq_putc(st->seq, '\n');
}
/*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4c422632047b..4797d08581ce 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -45,43 +45,39 @@
#include <asm/siginfo.h>
#include <asm/debug.h>
-#include "icswx.h"
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline bool notify_page_fault(struct pt_regs *regs)
{
- int ret = 0;
+ bool ret = false;
+#ifdef CONFIG_KPROBES
/* kprobe_running() needs smp_processor_id() */
if (!user_mode(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 11))
- ret = 1;
+ ret = true;
preempt_enable();
}
+#endif /* CONFIG_KPROBES */
+
+ if (unlikely(debugger_fault_handler(regs)))
+ ret = true;
return ret;
}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- return 0;
-}
-#endif
/*
* Check whether the instruction at regs->nip is a store using
* an update addressing form which will update r1.
*/
-static int store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(struct pt_regs *regs)
{
unsigned int inst;
if (get_user(inst, (unsigned int __user *)regs->nip))
- return 0;
+ return false;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
- return 0;
+ return false;
/* check major opcode */
switch (inst >> 26) {
case 37: /* stwu */
@@ -89,7 +85,7 @@ static int store_updates_sp(struct pt_regs *regs)
case 45: /* sthu */
case 53: /* stfsu */
case 55: /* stfdu */
- return 1;
+ return true;
case 62: /* std or stdu */
return (inst & 3) == 1;
case 31:
@@ -101,18 +97,53 @@ static int store_updates_sp(struct pt_regs *regs)
case 439: /* sthux */
case 695: /* stfsux */
case 759: /* stfdux */
- return 1;
+ return true;
}
}
- return 0;
+ return false;
}
/*
* do_page_fault error handling helpers
*/
-#define MM_FAULT_RETURN 0
-#define MM_FAULT_CONTINUE -1
-#define MM_FAULT_ERR(sig) (sig)
+static int
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
+{
+ /*
+ * If we are in kernel mode, bail out with a SEGV, this will
+ * be caught by the assembly which will restore the non-volatile
+ * registers before calling bad_page_fault()
+ */
+ if (!user_mode(regs))
+ return SIGSEGV;
+
+ _exception(SIGSEGV, regs, si_code, address);
+
+ return 0;
+}
+
+static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
+{
+ return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
+}
+
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ return __bad_area_nosemaphore(regs, address, si_code);
+}
+
+static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+{
+ return __bad_area(regs, address, SEGV_MAPERR);
+}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
unsigned int fault)
@@ -121,7 +152,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
unsigned int lsb = 0;
if (!user_mode(regs))
- return MM_FAULT_ERR(SIGBUS);
+ return SIGBUS;
current->thread.trap_nr = BUS_ADRERR;
info.si_signo = SIGBUS;
@@ -142,25 +173,17 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
#endif
info.si_addr_lsb = lsb;
force_sig_info(SIGBUS, &info, current);
- return MM_FAULT_RETURN;
+ return 0;
}
static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
{
/*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue the pagefault.
+ * Kernel page fault interrupted by SIGKILL. We have no reason to
+ * continue processing.
*/
- if (fatal_signal_pending(current)) {
- /* Coming from kernel, we need to deal with uaccess fixups */
- if (user_mode(regs))
- return MM_FAULT_RETURN;
- return MM_FAULT_ERR(SIGKILL);
- }
-
- /* No fault: be happy */
- if (!(fault & VM_FAULT_ERROR))
- return MM_FAULT_CONTINUE;
+ if (fatal_signal_pending(current) && !user_mode(regs))
+ return SIGKILL;
/* Out of memory */
if (fault & VM_FAULT_OOM) {
@@ -169,19 +192,176 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
* made us unable to handle the page fault gracefully.
*/
if (!user_mode(regs))
- return MM_FAULT_ERR(SIGKILL);
+ return SIGSEGV;
pagefault_out_of_memory();
- return MM_FAULT_RETURN;
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ return do_sigbus(regs, addr, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ return bad_area_nosemaphore(regs, addr);
+ else
+ BUG();
+ }
+ return 0;
+}
+
+/* Is this a bad kernel fault ? */
+static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
+ unsigned long address)
+{
+ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) {
+ printk_ratelimited(KERN_CRIT "kernel tried to execute"
+ " exec-protected page (%lx) -"
+ "exploit attempt? (uid: %d)\n",
+ address, from_kuid(&init_user_ns,
+ current_uid()));
}
+ return is_exec || (address >= TASK_SIZE);
+}
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
- return do_sigbus(regs, addr, fault);
+static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma,
+ bool store_update_sp)
+{
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+ * 288 bytes below the stack pointer.
+ * The kernel signal delivery code writes up to about 1.5kB
+ * below the stack pointer (r1) before decrementing it.
+ * The exec code can write slightly over 640kB to the stack
+ * before setting the user r1. Thus we allow the stack to
+ * expand to 1MB without further checks.
+ */
+ if (address + 0x100000 < vma->vm_end) {
+ /* get user regs even if this fault is in kernel mode */
+ struct pt_regs *uregs = current->thread.regs;
+ if (uregs == NULL)
+ return true;
- /* We don't understand the fault code, this is fatal */
- BUG();
- return MM_FAULT_CONTINUE;
+ /*
+ * A user-mode access to an address a long way below
+ * the stack pointer is only valid if the instruction
+ * is one which would update the stack pointer to the
+ * address accessed if the instruction completed,
+ * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+ * (or the byte, halfword, float or double forms).
+ *
+ * If we don't check this then any write to the area
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+ if (address + 2048 < uregs->gpr[1] && !store_update_sp)
+ return true;
+ }
+ return false;
+}
+
+static bool access_error(bool is_write, bool is_exec,
+ struct vm_area_struct *vma)
+{
+ /*
+ * Allow execution from readable areas if the MMU does not
+ * provide separate controls over reading and executing.
+ *
+ * Note: That code used to not be enabled for 4xx/BookE.
+ * It is now as I/D cache coherency for these is done at
+ * set_pte_at() time and I see no reason why the test
+ * below wouldn't be valid on those processors. This -may-
+ * break programs compiled with a really old ABI though.
+ */
+ if (is_exec) {
+ return !(vma->vm_flags & VM_EXEC) &&
+ (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+ !(vma->vm_flags & (VM_READ | VM_WRITE)));
+ }
+
+ if (is_write) {
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return true;
+ return false;
+ }
+
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return true;
+
+ return false;
}
+#ifdef CONFIG_PPC_SMLPAR
+static inline void cmo_account_page_fault(void)
+{
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
+ }
+}
+#else
+static inline void cmo_account_page_fault(void) { }
+#endif /* CONFIG_PPC_SMLPAR */
+
+#ifdef CONFIG_PPC_STD_MMU
+static void sanity_check_fault(bool is_write, unsigned long error_code)
+{
+ /*
+ * For hash translation mode, we should never get a
+ * PROTFAULT. Any update to pte to reduce access will result in us
+ * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
+ * fault instead of DSISR_PROTFAULT.
+ *
+ * A pte update to relax the access will not result in a hash page table
+ * entry invalidate and hence can result in DSISR_PROTFAULT.
+ * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
+ * the special !is_write in the below conditional.
+ *
+ * For platforms that doesn't supports coherent icache and do support
+ * per page noexec bit, we do setup things such that we do the
+ * sync between D/I cache via fault. But that is handled via low level
+ * hash fault code (hash_page_do_lazy_icache()) and we should not reach
+ * here in such case.
+ *
+ * For wrong access that can result in PROTFAULT, the above vma->vm_flags
+ * check should handle those and hence we should fall to the bad_area
+ * handling correctly.
+ *
+ * For embedded with per page exec support that doesn't support coherent
+ * icache we do get PROTFAULT and we handle that D/I cache sync in
+ * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
+ * is conditional for server MMU.
+ *
+ * For radix, we can get prot fault for autonuma case, because radix
+ * page table will have them marked noaccess for user.
+ */
+ if (!radix_enabled() && !is_write)
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+}
+#else
+static void sanity_check_fault(bool is_write, unsigned long error_code) { }
+#endif /* CONFIG_PPC_STD_MMU */
+
+/*
+ * Define the correct "is_write" bit in error_code based
+ * on the processor family
+ */
+#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+#define page_fault_is_write(__err) ((__err) & ESR_DST)
+#define page_fault_is_bad(__err) (0)
+#else
+#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE)
+#if defined(CONFIG_PPC_8xx)
+#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G)
+#elif defined(CONFIG_PPC64)
+#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S)
+#else
+#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S)
+#endif
+#endif
+
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
* for a data fault, SRR1 for an instruction fault. For 400-family processors
@@ -195,92 +375,56 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
* The return value is 0 if the fault was handled, or the signal
* number if this is a kernel fault that can't be handled here.
*/
-int do_page_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
+static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
{
- enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- int code = SEGV_MAPERR;
- int is_write = 0;
- int trap = TRAP(regs);
- int is_exec = trap == 0x400;
+ int is_exec = TRAP(regs) == 0x400;
int is_user = user_mode(regs);
- int fault;
- int rc = 0, store_update_sp = 0;
+ int is_write = page_fault_is_write(error_code);
+ int fault, major = 0;
+ bool store_update_sp = false;
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
- /*
- * Fortunately the bit assignments in SRR1 for an instruction
- * fault and DSISR for a data fault are mostly the same for the
- * bits we are interested in. But there are some bits which
- * indicate errors in DSISR but can validly be set in SRR1.
- */
- if (is_exec)
- error_code &= 0x48200000;
- else
- is_write = error_code & DSISR_ISSTORE;
-#else
- is_write = error_code & ESR_DST;
-#endif /* CONFIG_4xx || CONFIG_BOOKE */
+ if (notify_page_fault(regs))
+ return 0;
-#ifdef CONFIG_PPC_ICSWX
- /*
- * we need to do this early because this "data storage
- * interrupt" does not update the DAR/DEAR so we don't want to
- * look at it
- */
- if (error_code & ICSWX_DSI_UCT) {
- rc = acop_handle_fault(regs, address, error_code);
- if (rc)
- goto bail;
+ if (unlikely(page_fault_is_bad(error_code))) {
+ if (is_user) {
+ _exception(SIGBUS, regs, BUS_OBJERR, address);
+ return 0;
+ }
+ return SIGBUS;
}
-#endif /* CONFIG_PPC_ICSWX */
-
- if (notify_page_fault(regs))
- goto bail;
- if (unlikely(debugger_fault_handler(regs)))
- goto bail;
+ /* Additional sanity check(s) */
+ sanity_check_fault(is_write, error_code);
/*
* The kernel should never take an execute fault nor should it
* take a page fault to a kernel address.
*/
- if (!is_user && (is_exec || (address >= TASK_SIZE))) {
- rc = SIGSEGV;
- goto bail;
- }
+ if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address)))
+ return SIGSEGV;
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
- defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx))
- if (error_code & DSISR_DABRMATCH) {
- /* breakpoint match */
- do_break(regs, address, error_code);
- goto bail;
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in a region with pagefaults disabled then we must not take the fault
+ */
+ if (unlikely(faulthandler_disabled() || !mm)) {
+ if (is_user)
+ printk_ratelimited(KERN_ERR "Page fault in user mode"
+ " with faulthandler_disabled()=%d"
+ " mm=%p\n",
+ faulthandler_disabled(), mm);
+ return bad_area_nosemaphore(regs, address);
}
-#endif
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
- if (faulthandler_disabled() || mm == NULL) {
- if (!is_user) {
- rc = SIGSEGV;
- goto bail;
- }
- /* faulthandler_disabled() in user mode is really bad,
- as is current->mm == NULL. */
- printk(KERN_EMERG "Page fault in user mode with "
- "faulthandler_disabled() = %d mm = %p\n",
- faulthandler_disabled(), mm);
- printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
- regs->nip, regs->msr);
- die("Weird page fault", regs, SIGSEGV);
- }
-
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
@@ -293,6 +437,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_user)
flags |= FAULT_FLAG_USER;
+ if (is_write)
+ flags |= FAULT_FLAG_WRITE;
+ if (is_exec)
+ flags |= FAULT_FLAG_INSTRUCTION;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -309,9 +457,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!is_user && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
+ return bad_area_nosemaphore(regs, address);
retry:
down_read(&mm->mmap_sem);
@@ -325,122 +473,24 @@ retry:
}
vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
+ if (unlikely(!vma))
+ return bad_area(regs, address);
+ if (likely(vma->vm_start <= address))
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return bad_area(regs, address);
- /*
- * N.B. The POWER/Open ABI allows programs to access up to
- * 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
- * below the stack pointer (r1) before decrementing it.
- * The exec code can write slightly over 640kB to the stack
- * before setting the user r1. Thus we allow the stack to
- * expand to 1MB without further checks.
- */
- if (address + 0x100000 < vma->vm_end) {
- /* get user regs even if this fault is in kernel mode */
- struct pt_regs *uregs = current->thread.regs;
- if (uregs == NULL)
- goto bad_area;
+ /* The stack is being expanded, check if it's valid */
+ if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
+ return bad_area(regs, address);
- /*
- * A user-mode access to an address a long way below
- * the stack pointer is only valid if the instruction
- * is one which would update the stack pointer to the
- * address accessed if the instruction completed,
- * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
- * (or the byte, halfword, float or double forms).
- *
- * If we don't check this then any write to the area
- * between the last mapped region and the stack will
- * expand the stack rather than segfaulting.
- */
- if (address + 2048 < uregs->gpr[1] && !store_update_sp)
- goto bad_area;
- }
- if (expand_stack(vma, address))
- goto bad_area;
+ /* Try to expand it */
+ if (unlikely(expand_stack(vma, address)))
+ return bad_area(regs, address);
good_area:
- code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
- if (error_code & 0x95700000)
- /* an error such as lwarx to I/O controller space,
- address matching DABR, eciwx, etc. */
- goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
- /* The MPC8xx seems to always set 0x80000000, which is
- * "undefined". Of those that can be set, this is the only
- * one which seems bad.
- */
- if (error_code & 0x10000000)
- /* Guarded storage error. */
- goto bad_area;
-#endif /* CONFIG_8xx */
-
- if (is_exec) {
- /*
- * Allow execution from readable areas if the MMU does not
- * provide separate controls over reading and executing.
- *
- * Note: That code used to not be enabled for 4xx/BookE.
- * It is now as I/D cache coherency for these is done at
- * set_pte_at() time and I see no reason why the test
- * below wouldn't be valid on those processors. This -may-
- * break programs compiled with a really old ABI though.
- */
- if (!(vma->vm_flags & VM_EXEC) &&
- (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
- !(vma->vm_flags & (VM_READ | VM_WRITE))))
- goto bad_area;
- /* a write */
- } else if (is_write) {
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- flags |= FAULT_FLAG_WRITE;
- /* a read */
- } else {
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- }
-#ifdef CONFIG_PPC_STD_MMU
- /*
- * For hash translation mode, we should never get a
- * PROTFAULT. Any update to pte to reduce access will result in us
- * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
- * fault instead of DSISR_PROTFAULT.
- *
- * A pte update to relax the access will not result in a hash page table
- * entry invalidate and hence can result in DSISR_PROTFAULT.
- * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
- * the special !is_write in the below conditional.
- *
- * For platforms that doesn't supports coherent icache and do support
- * per page noexec bit, we do setup things such that we do the
- * sync between D/I cache via fault. But that is handled via low level
- * hash fault code (hash_page_do_lazy_icache()) and we should not reach
- * here in such case.
- *
- * For wrong access that can result in PROTFAULT, the above vma->vm_flags
- * check should handle those and hence we should fall to the bad_area
- * handling correctly.
- *
- * For embedded with per page exec support that doesn't support coherent
- * icache we do get PROTFAULT and we handle that D/I cache sync in
- * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
- * is conditional for server MMU.
- *
- * For radix, we can get prot fault for autonuma case, because radix
- * page table will have them marked noaccess for user.
- */
- if (!radix_enabled() && !is_write)
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
+ if (unlikely(access_error(is_write, is_exec, vma)))
+ return bad_area(regs, address);
/*
* If for any reason at all we couldn't handle the fault,
@@ -448,6 +498,7 @@ good_area:
* the fault.
*/
fault = handle_mm_fault(vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
/*
* Handle the retry right now, the mmap_sem has been released in that
@@ -465,64 +516,39 @@ good_area:
if (!fatal_signal_pending(current))
goto retry;
}
- /* We will enter mm_fault_error() below */
- } else
- up_read(&current->mm->mmap_sem);
-
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (fault & VM_FAULT_SIGSEGV)
- goto bad_area_nosemaphore;
- rc = mm_fault_error(regs, address, fault);
- if (rc >= MM_FAULT_RETURN)
- goto bail;
- else
- rc = 0;
+
+ /*
+ * User mode? Just return to handle the fatal exception otherwise
+ * return to bad_page_fault
+ */
+ return is_user ? 0 : SIGBUS;
}
+ up_read(&current->mm->mmap_sem);
+
+ if (unlikely(fault & VM_FAULT_ERROR))
+ return mm_fault_error(regs, address, fault);
+
/*
* Major/minor page fault accounting.
*/
- if (fault & VM_FAULT_MAJOR) {
+ if (major) {
current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
-#ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- u32 page_ins;
-
- preempt_disable();
- page_ins = be32_to_cpu(get_lppaca()->page_ins);
- page_ins += 1 << PAGE_FACTOR;
- get_lppaca()->page_ins = cpu_to_be32(page_ins);
- preempt_enable();
- }
-#endif /* CONFIG_PPC_SMLPAR */
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ cmo_account_page_fault();
} else {
current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
-
- goto bail;
-
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses cause a SIGSEGV */
- if (is_user) {
- _exception(SIGSEGV, regs, code, address);
- goto bail;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+ return 0;
+}
+NOKPROBE_SYMBOL(__do_page_fault);
- if (is_exec && (error_code & DSISR_PROTFAULT))
- printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
- " page (%lx) - exploit attempt? (uid: %d)\n",
- address, from_kuid(&init_user_ns, current_uid()));
-
- rc = SIGSEGV;
-
-bail:
+int do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ enum ctx_state prev_state = exception_enter();
+ int rc = __do_page_fault(regs, address, error_code);
exception_exit(prev_state);
return rc;
}
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 6f962e5cb5e1..ffbd7c0bda96 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -575,7 +575,6 @@ _GLOBAL(flush_hash_pages)
rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */
stwcx. r8,0,r5 /* update the pte */
bne- 33b
-EXPORT_SYMBOL(flush_hash_pages)
/* Get the address of the primary PTE group in the hash table (r3) */
_GLOBAL(flush_hash_patch_A)
@@ -634,6 +633,7 @@ _GLOBAL(flush_hash_patch_B)
SYNC_601
isync
blr
+EXPORT_SYMBOL(flush_hash_pages)
/*
* Flush an entry from the TLB
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a20669c19e7..67ec2e927253 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -61,6 +61,7 @@
#include <asm/tm.h>
#include <asm/trace.h>
#include <asm/ps3.h>
+#include <asm/pte-walk.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -507,9 +508,9 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
printk(KERN_INFO "Huge page(16GB) memory: "
"addr = 0x%lX size = 0x%lX pages = %d\n",
phys_addr, block_size, expected_pages);
- if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+ if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
memblock_reserve(phys_addr, block_size * expected_pages);
- add_gpage(phys_addr, block_size, expected_pages);
+ pseries_add_gpage(phys_addr, block_size, expected_pages);
}
return 0;
}
@@ -1019,6 +1020,7 @@ void __init hash__early_init_mmu(void)
__kernel_virt_size = H_KERN_VIRT_SIZE;
__vmalloc_start = H_VMALLOC_START;
__vmalloc_end = H_VMALLOC_END;
+ __kernel_io_start = H_KERN_IO_START;
vmemmap = (struct page *)H_VMEMMAP_BASE;
ioremap_bot = IOREMAP_BASE;
@@ -1228,7 +1230,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
unsigned long vsid;
pte_t *ptep;
unsigned hugeshift;
- const struct cpumask *tmp;
int rc, user_region = 0;
int psize, ssize;
@@ -1280,8 +1281,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
}
/* Check CPU locality */
- tmp = cpumask_of(smp_processor_id());
- if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
+ if (user_region && mm_is_thread_local(mm))
flags |= HPTE_LOCAL_UPDATE;
#ifndef CONFIG_PPC_64K_PAGES
@@ -1297,7 +1297,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
#endif /* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
- ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift);
+ ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
rc = 1;
@@ -1526,7 +1526,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
* THP pages use update_mmu_cache_pmd. We don't do
* hash preload there. Hence can ignore THP here
*/
- ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift);
+ ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
if (!ptep)
goto out_exit;
@@ -1543,7 +1543,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
#endif /* CONFIG_PPC_64K_PAGES */
/* Is that local to this CPU ? */
- if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ if (mm_is_thread_local(mm))
update_flags |= HPTE_LOCAL_UPDATE;
/* Hash it in */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e1bf5ca397fe..1571a498a33f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,6 +24,8 @@
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+
#ifdef CONFIG_HUGETLB_PAGE
@@ -36,32 +38,15 @@
unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);
-/*
- * Tracks gpages after the device tree is scanned and before the
- * huge_boot_pages list is ready. On non-Freescale implementations, this is
- * just used to track 16G pages and so is a single array. FSL-based
- * implementations may have more than one gpage size, so we need multiple
- * arrays
- */
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-#define MAX_NUMBER_GPAGES 128
-struct psize_gpages {
- u64 gpage_list[MAX_NUMBER_GPAGES];
- unsigned int nr_gpages;
-};
-static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
-#else
-#define MAX_NUMBER_GPAGES 1024
-static u64 gpage_freearray[MAX_NUMBER_GPAGES];
-static unsigned nr_gpages;
-#endif
-
#define hugepd_none(hpd) (hpd_val(hpd) == 0)
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
- /* Only called for hugetlbfs pages, hence can ignore THP */
- return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
+ /*
+ * Only called for hugetlbfs pages, hence can ignore THP and the
+ * irq disabled walk.
+ */
+ return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -210,145 +195,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
return hugepte_offset(*hpdp, addr, pdshift);
}
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-/* Build list of addresses of gigantic pages. This function is used in early
- * boot before the buddy allocator is setup.
- */
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
-{
- unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
- int i;
-
- if (addr == 0)
- return;
-
- gpage_freearray[idx].nr_gpages = number_of_pages;
-
- for (i = 0; i < number_of_pages; i++) {
- gpage_freearray[idx].gpage_list[i] = addr;
- addr += page_size;
- }
-}
-
-/*
- * Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
-{
- struct huge_bootmem_page *m;
- int idx = shift_to_mmu_psize(huge_page_shift(hstate));
- int nr_gpages = gpage_freearray[idx].nr_gpages;
-
- if (nr_gpages == 0)
- return 0;
-
-#ifdef CONFIG_HIGHMEM
- /*
- * If gpages can be in highmem we can't use the trick of storing the
- * data structure in the page; allocate space for this
- */
- m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
- m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
-#else
- m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
-#endif
-
- list_add(&m->list, &huge_boot_pages);
- gpage_freearray[idx].nr_gpages = nr_gpages;
- gpage_freearray[idx].gpage_list[nr_gpages] = 0;
- m->hstate = hstate;
-
- return 1;
-}
+#ifdef CONFIG_PPC_BOOK3S_64
/*
- * Scan the command line hugepagesz= options for gigantic pages; store those in
- * a list that we use to allocate the memory once all options are parsed.
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready on pseries.
*/
-
-unsigned long gpage_npages[MMU_PAGE_COUNT];
-
-static int __init do_gpage_early_setup(char *param, char *val,
- const char *unused, void *arg)
-{
- static phys_addr_t size;
- unsigned long npages;
-
- /*
- * The hugepagesz and hugepages cmdline options are interleaved. We
- * use the size variable to keep track of whether or not this was done
- * properly and skip over instances where it is incorrect. Other
- * command-line parsing code will issue warnings, so we don't need to.
- *
- */
- if ((strcmp(param, "default_hugepagesz") == 0) ||
- (strcmp(param, "hugepagesz") == 0)) {
- size = memparse(val, NULL);
- } else if (strcmp(param, "hugepages") == 0) {
- if (size != 0) {
- if (sscanf(val, "%lu", &npages) <= 0)
- npages = 0;
- if (npages > MAX_NUMBER_GPAGES) {
- pr_warn("MMU: %lu pages requested for page "
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- "size %llu KB, limiting to "
-#else
- "size %u KB, limiting to "
-#endif
- __stringify(MAX_NUMBER_GPAGES) "\n",
- npages, size / 1024);
- npages = MAX_NUMBER_GPAGES;
- }
- gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
- size = 0;
- }
- }
- return 0;
-}
-
+#define MAX_NUMBER_GPAGES 1024
+__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
+__initdata static unsigned nr_gpages;
/*
- * This function allocates physical space for pages that are larger than the
- * buddy allocator can handle. We want to allocate these in highmem because
- * the amount of lowmem is limited. This means that this function MUST be
- * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
- * allocate to grab highmem.
- */
-void __init reserve_hugetlb_gpages(void)
-{
- static __initdata char cmdline[COMMAND_LINE_SIZE];
- phys_addr_t size, base;
- int i;
-
- strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
- parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
- NULL, &do_gpage_early_setup);
-
- /*
- * Walk gpage list in reverse, allocating larger page sizes first.
- * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
- * When we reach the point in the list where pages are no longer
- * considered gpages, we're done.
- */
- for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
- if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
- continue;
- else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
- break;
-
- size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
- base = memblock_alloc_base(size * gpage_npages[i], size,
- MEMBLOCK_ALLOC_ANYWHERE);
- add_gpage(base, size, gpage_npages[i]);
- }
-}
-
-#else /* !PPC_FSL_BOOK3E */
-
-/* Build list of addresses of gigantic pages. This function is used in early
+ * Build list of addresses of gigantic pages. This function is used in early
* boot before the buddy allocator is setup.
*/
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
if (!addr)
return;
@@ -360,10 +220,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
}
}
-/* Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
+int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
struct huge_bootmem_page *m;
if (nr_gpages == 0)
@@ -376,6 +233,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
}
#endif
+
+int __init alloc_bootmem_huge_page(struct hstate *h)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
+ return pseries_alloc_bootmem_huge_page(h);
+#endif
+ return __alloc_bootmem_huge_page(h);
+}
+
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -407,8 +275,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
batchp = &get_cpu_var(hugepd_freelist_cur);
if (atomic_read(&tlb->mm->mm_users) < 2 ||
- cpumask_equal(mm_cpumask(tlb->mm),
- cpumask_of(smp_processor_id()))) {
+ mm_is_thread_local(tlb->mm)) {
kmem_cache_free(hugepte_cache, hugepte);
put_cpu_var(hugepd_freelist_cur);
return;
@@ -886,9 +753,8 @@ void flush_dcache_icache_hugepage(struct page *page)
* This function need to be called with interrupts disabled. We use this variant
* when we have MSR[EE] = 0 but the paca->soft_enabled = 1
*/
-
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
- bool *is_thp, unsigned *shift)
+pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+ bool *is_thp, unsigned *hpage_shift)
{
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
@@ -897,8 +763,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
hugepd_t *hpdp = NULL;
unsigned pdshift = PGDIR_SHIFT;
- if (shift)
- *shift = 0;
+ if (hpage_shift)
+ *hpage_shift = 0;
if (is_thp)
*is_thp = false;
@@ -968,16 +834,15 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
ret_pte = hugepte_offset(*hpdp, ea, pdshift);
pdshift = hugepd_shift(*hpdp);
out:
- if (shift)
- *shift = pdshift;
+ if (hpage_shift)
+ *hpage_shift = pdshift;
return ret_pte;
}
-EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte);
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
unsigned long pte_end;
struct page *head, *page;
pte_t pte;
@@ -988,18 +853,10 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
end = pte_end;
pte = READ_ONCE(*ptep);
- mask = _PAGE_PRESENT | _PAGE_READ;
- /*
- * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
- * as 0 and _PAGE_RO has to be set when a page is not writable
- */
- if (write)
- mask |= _PAGE_WRITE;
- else
- mask |= _PAGE_RO;
-
- if ((pte_val(pte) & mask) != mask)
+ if (!pte_present(pte) || !pte_read(pte))
+ return 0;
+ if (write && !pte_write(pte))
return 0;
/* hugepages are never "special" */
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
deleted file mode 100644
index 1fa794d7d59f..000000000000
--- a/arch/powerpc/mm/icswx.c
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * ICSWX and ACOP Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include "icswx.h"
-
-/*
- * The processor and its L2 cache cause the icswx instruction to
- * generate a COP_REQ transaction on PowerBus. The transaction has no
- * address, and the processor does not perform an MMU access to
- * authenticate the transaction. The command portion of the PowerBus
- * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
- * Process ID (PID), which the coprocessor compares to the authorized
- * LPID and PID held in the coprocessor, to determine if the process
- * is authorized to generate the transaction. The data of the COP_REQ
- * transaction is 128-byte or less in size and is placed in cacheable
- * memory on a 128-byte cache line boundary.
- *
- * The task to use a coprocessor should use use_cop() to mark the use
- * of the Coprocessor Type (CT) and context switching. On a server
- * class processor, the PID register is used only for coprocessor
- * management + * and so a coprocessor PID is allocated before
- * executing icswx + * instruction. Drop_cop() is used to free the
- * coprocessor PID.
- *
- * Example:
- * Host Fabric Interface (HFI) is a PowerPC network coprocessor.
- * Each HFI have multiple windows. Each HFI window serves as a
- * network device sending to and receiving from HFI network.
- * HFI immediate send function uses icswx instruction. The immediate
- * send function allows small (single cache-line) packets be sent
- * without using the regular HFI send FIFO and doorbell, which are
- * much slower than immediate send.
- *
- * For each task intending to use HFI immediate send, the HFI driver
- * calls use_cop() to obtain a coprocessor PID for the task.
- * The HFI driver then allocate a free HFI window and save the
- * coprocessor PID to the HFI window to allow the task to use the
- * HFI window.
- *
- * The HFI driver repeatedly creates immediate send packets and
- * issues icswx instruction to send data through the HFI window.
- * The HFI compares the coprocessor PID in the CPU PID register
- * to the PID held in the HFI window to determine if the transaction
- * is allowed.
- *
- * When the task to release the HFI window, the HFI driver calls
- * drop_cop() to release the coprocessor PID.
- */
-
-void switch_cop(struct mm_struct *next)
-{
-#ifdef CONFIG_PPC_ICSWX_PID
- mtspr(SPRN_PID, next->context.cop_pid);
-#endif
- mtspr(SPRN_ACOP, next->context.acop);
-}
-
-/**
- * Start using a coprocessor.
- * @acop: mask of coprocessor to be used.
- * @mm: The mm the coprocessor to associate with. Most likely current mm.
- *
- * Return a positive PID if successful. Negative errno otherwise.
- * The returned PID will be fed to the coprocessor to determine if an
- * icswx transaction is authenticated.
- */
-int use_cop(unsigned long acop, struct mm_struct *mm)
-{
- int ret;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX))
- return -ENODEV;
-
- if (!mm || !acop)
- return -EINVAL;
-
- /* The page_table_lock ensures mm_users won't change under us */
- spin_lock(&mm->page_table_lock);
- spin_lock(mm->context.cop_lockp);
-
- ret = get_cop_pid(mm);
- if (ret < 0)
- goto out;
-
- /* update acop */
- mm->context.acop |= acop;
-
- sync_cop(mm);
-
- /*
- * If this is a threaded process then there might be other threads
- * running. We need to send an IPI to force them to pick up any
- * change in PID and ACOP.
- */
- if (atomic_read(&mm->mm_users) > 1)
- smp_call_function(sync_cop, mm, 1);
-
-out:
- spin_unlock(mm->context.cop_lockp);
- spin_unlock(&mm->page_table_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(use_cop);
-
-/**
- * Stop using a coprocessor.
- * @acop: mask of coprocessor to be stopped.
- * @mm: The mm the coprocessor associated with.
- */
-void drop_cop(unsigned long acop, struct mm_struct *mm)
-{
- int free_pid;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX))
- return;
-
- if (WARN_ON_ONCE(!mm))
- return;
-
- /* The page_table_lock ensures mm_users won't change under us */
- spin_lock(&mm->page_table_lock);
- spin_lock(mm->context.cop_lockp);
-
- mm->context.acop &= ~acop;
-
- free_pid = disable_cop_pid(mm);
- sync_cop(mm);
-
- /*
- * If this is a threaded process then there might be other threads
- * running. We need to send an IPI to force them to pick up any
- * change in PID and ACOP.
- */
- if (atomic_read(&mm->mm_users) > 1)
- smp_call_function(sync_cop, mm, 1);
-
- if (free_pid != COP_PID_NONE)
- free_cop_pid(free_pid);
-
- spin_unlock(mm->context.cop_lockp);
- spin_unlock(&mm->page_table_lock);
-}
-EXPORT_SYMBOL_GPL(drop_cop);
-
-static int acop_use_cop(int ct)
-{
- /* There is no alternate policy, yet */
- return -1;
-}
-
-/*
- * Get the instruction word at the NIP
- */
-static u32 acop_get_inst(struct pt_regs *regs)
-{
- u32 inst;
- u32 __user *p;
-
- p = (u32 __user *)regs->nip;
- if (!access_ok(VERIFY_READ, p, sizeof(*p)))
- return 0;
-
- if (__get_user(inst, p))
- return 0;
-
- return inst;
-}
-
-/**
- * @regs: registers at time of interrupt
- * @address: storage address
- * @error_code: Fault code, usually the DSISR or ESR depending on
- * processor type
- *
- * Return 0 if we are able to resolve the data storage fault that
- * results from a CT miss in the ACOP register.
- */
-int acop_handle_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- int ct;
- u32 inst = 0;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX)) {
- pr_info("No coprocessors available");
- _exception(SIGILL, regs, ILL_ILLOPN, address);
- }
-
- if (!user_mode(regs)) {
- /* this could happen if the HV denies the
- * kernel access, for now we just die */
- die("ICSWX from kernel failed", regs, SIGSEGV);
- }
-
- /* Some implementations leave us a hint for the CT */
- ct = ICSWX_GET_CT_HINT(error_code);
- if (ct < 0) {
- /* we have to peek at the instruction word to figure out CT */
- u32 ccw;
- u32 rs;
-
- inst = acop_get_inst(regs);
- if (inst == 0)
- return -1;
-
- rs = (inst >> (31 - 10)) & 0x1f;
- ccw = regs->gpr[rs];
- ct = (ccw >> 16) & 0x3f;
- }
-
- /*
- * We could be here because another thread has enabled acop
- * but the ACOP register has yet to be updated.
- *
- * This should have been taken care of by the IPI to sync all
- * the threads (see smp_call_function(sync_cop, mm, 1)), but
- * that could take forever if there are a significant amount
- * of threads.
- *
- * Given the number of threads on some of these systems,
- * perhaps this is the best way to sync ACOP rather than whack
- * every thread with an IPI.
- */
- if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
- sync_cop(current->active_mm);
- return 0;
- }
-
- /* check for alternate policy */
- if (!acop_use_cop(ct))
- return 0;
-
- /* at this point the CT is unknown to the system */
- pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
- current->comm, current->pid, ct);
-
- /* get inst if we don't already have it */
- if (inst == 0) {
- inst = acop_get_inst(regs);
- if (inst == 0)
- return -1;
- }
-
- /* Check if the instruction is the "record form" */
- if (inst & 1) {
- /*
- * the instruction is "record" form so we can reject
- * using CR0
- */
- regs->ccr &= ~(0xful << 28);
- regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
-
- /* Move on to the next instruction */
- regs->nip += 4;
- } else {
- /*
- * There is no architected mechanism to report a bad
- * CT so we could either SIGILL or report nothing.
- * Since the non-record version should only bu used
- * for "hints" or "don't care" we should probably do
- * nothing. However, I could see how some people
- * might want an SIGILL so it here if you want it.
- */
-#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
- _exception(SIGILL, regs, ILL_ILLOPN, address);
-#else
- regs->nip += 4;
-#endif
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(acop_handle_fault);
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
deleted file mode 100644
index 6dedc08e62c8..000000000000
--- a/arch/powerpc/mm/icswx.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _ARCH_POWERPC_MM_ICSWX_H_
-#define _ARCH_POWERPC_MM_ICSWX_H_
-
-/*
- * ICSWX and ACOP Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <asm/mmu_context.h>
-
-/* also used to denote that PIDs are not used */
-#define COP_PID_NONE 0
-
-static inline void sync_cop(void *arg)
-{
- struct mm_struct *mm = arg;
-
- if (mm == current->active_mm)
- switch_cop(current->active_mm);
-}
-
-#ifdef CONFIG_PPC_ICSWX_PID
-extern int get_cop_pid(struct mm_struct *mm);
-extern int disable_cop_pid(struct mm_struct *mm);
-extern void free_cop_pid(int free_pid);
-#else
-#define get_cop_pid(m) (COP_PID_NONE)
-#define disable_cop_pid(m) (COP_PID_NONE)
-#define free_cop_pid(p)
-#endif
-
-/*
- * These are implementation bits for architected registers. If this
- * ever becomes architecture the should be moved to reg.h et. al.
- */
-/* UCT is the same bit for Server and Embedded */
-#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */
-
-#ifdef CONFIG_PPC_BOOK3E
-/* Embedded implementation gives us no hints as to what the CT is */
-#define ICSWX_GET_CT_HINT(x) (-1)
-#else
-/* Server implementation contains the CT value in the DSISR */
-#define ICSWX_DSISR_CTMASK 0x00003f00
-#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8)
-#endif
-
-#define ICSWX_RC_STARTED 0x8 /* The request has been started */
-#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */
-#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */
-#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */
-
-extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code);
-
-static inline u64 acop_copro_type_bit(unsigned int type)
-{
- return 1ULL << (63 - type);
-}
-
-#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c
deleted file mode 100644
index 91e30eb7d054..000000000000
--- a/arch/powerpc/mm/icswx_pid.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * ICSWX and ACOP/PID Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include "icswx.h"
-
-#define COP_PID_MIN (COP_PID_NONE + 1)
-#define COP_PID_MAX (0xFFFF)
-
-static DEFINE_SPINLOCK(mmu_context_acop_lock);
-static DEFINE_IDA(cop_ida);
-
-static int new_cop_pid(struct ida *ida, int min_id, int max_id,
- spinlock_t *lock)
-{
- int index;
- int err;
-
-again:
- if (!ida_pre_get(ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(lock);
- err = ida_get_new_above(ida, min_id, &index);
- spin_unlock(lock);
-
- if (err == -EAGAIN)
- goto again;
- else if (err)
- return err;
-
- if (index > max_id) {
- spin_lock(lock);
- ida_remove(ida, index);
- spin_unlock(lock);
- return -ENOMEM;
- }
-
- return index;
-}
-
-int get_cop_pid(struct mm_struct *mm)
-{
- int pid;
-
- if (mm->context.cop_pid == COP_PID_NONE) {
- pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
- &mmu_context_acop_lock);
- if (pid >= 0)
- mm->context.cop_pid = pid;
- }
- return mm->context.cop_pid;
-}
-
-int disable_cop_pid(struct mm_struct *mm)
-{
- int free_pid = COP_PID_NONE;
-
- if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
- free_pid = mm->context.cop_pid;
- mm->context.cop_pid = COP_PID_NONE;
- }
- return free_pid;
-}
-
-void free_cop_pid(int free_pid)
-{
- spin_lock(&mmu_context_acop_lock);
- ida_remove(&cop_ida, free_pid);
- spin_unlock(&mmu_context_acop_lock);
-}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 8a7c38b8d335..6419b33ca309 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -113,6 +113,12 @@ void __init MMU_setup(void)
__map_without_bats = 1;
__map_without_ltlbs = 1;
}
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ if (rodata_enabled) {
+ __map_without_bats = 1;
+ __map_without_ltlbs = 1;
+ }
+#endif
}
/*
@@ -132,8 +138,6 @@ void __init MMU_init(void)
* Reserve gigantic pages for hugetlb. This MUST occur before
* lowmem_end_addr is initialized below.
*/
- reserve_hugetlb_gpages();
-
if (memblock.memory.cnt > 1) {
#ifndef CONFIG_WII
memblock_enforce_memory_limit(memblock.memory.regions[0].size);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 5b4c25d12ff3..588a521966ec 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -356,7 +356,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-#elif defined(CONFIG_FLATMEM)
+#else
struct page *realmode_pfn_to_page(unsigned long pfn)
{
@@ -365,7 +365,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#ifdef CONFIG_PPC_STD_MMU_64
static bool disable_radix;
@@ -381,7 +381,7 @@ early_param("disable_radix", parse_disable_radix);
* /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
* radix. If not, we clear the radix feature bit so we fall back to hash.
*/
-static void early_check_vec5(void)
+static void __init early_check_vec5(void)
{
unsigned long root, chosen;
int size;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 46b4e67d2372..4362b86ef84c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -436,7 +436,7 @@ void flush_dcache_icache_page(struct page *page)
return;
}
#endif
-#if defined(CONFIG_8xx) || defined(CONFIG_PPC64)
+#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64)
/* On 8xx there is no need to kmap since highmem is not supported */
__flush_dcache_icache(page_address(page));
#else
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
new file mode 100644
index 000000000000..0f613bc63c50
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context.c
@@ -0,0 +1,99 @@
+/*
+ * Common implementation of switch_mm_irqs_off
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+
+#include <asm/mmu_context.h>
+
+#if defined(CONFIG_PPC32)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ /* 32-bit keeps track of the current PGDIR in the thread struct */
+ tsk->thread.pgdir = mm->pgd;
+}
+#elif defined(CONFIG_PPC_BOOK3E_64)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ /* 64-bit Book3E keeps track of current PGD in the PACA */
+ get_paca()->pgd = mm->pgd;
+}
+#else
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm) { }
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline void inc_mm_active_cpus(struct mm_struct *mm)
+{
+ atomic_inc(&mm->context.active_cpus);
+}
+#else
+static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
+#endif
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ bool new_on_cpu = false;
+
+ /* Mark this context has been used on the new CPU */
+ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
+ cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+ inc_mm_active_cpus(next);
+
+ /*
+ * This full barrier orders the store to the cpumask above vs
+ * a subsequent operation which allows this CPU to begin loading
+ * translations for next.
+ *
+ * When using the radix MMU that operation is the load of the
+ * MMU context id, which is then moved to SPRN_PID.
+ *
+ * For the hash MMU it is either the first load from slb_cache
+ * in switch_slb(), and/or the store of paca->mm_ctx_id in
+ * copy_mm_to_paca().
+ *
+ * On the read side the barrier is in pte_xchg(), which orders
+ * the store to the PTE vs the load of mm_cpumask.
+ */
+ smp_mb();
+
+ new_on_cpu = true;
+ }
+
+ /* Some subarchs need to track the PGD elsewhere */
+ switch_mm_pgdir(tsk, next);
+
+ /* Nothing else to do if we aren't actually switching */
+ if (prev == next)
+ return;
+
+ /*
+ * We must stop all altivec streams before changing the HW
+ * context
+ */
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ asm volatile ("dssall");
+
+ if (new_on_cpu)
+ radix_kvm_prefetch_workaround(next);
+
+ /*
+ * The actual HW switching method differs between the various
+ * sub architectures. Out of line for now
+ */
+ switch_mmu_context(prev, next, tsk);
+}
+
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index a75f63833284..05e15386d4cb 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -25,8 +25,6 @@
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
-#include "icswx.h"
-
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
@@ -165,16 +163,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
return index;
mm->context.id = index;
-#ifdef CONFIG_PPC_ICSWX
- mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
- if (!mm->context.cop_lockp) {
- __destroy_context(index);
- subpage_prot_free(mm);
- mm->context.id = MMU_NO_CONTEXT;
- return -ENOMEM;
- }
- spin_lock_init(mm->context.cop_lockp);
-#endif /* CONFIG_PPC_ICSWX */
#ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
@@ -182,6 +170,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
#ifdef CONFIG_SPAPR_TCE_IOMMU
mm_iommu_init(mm);
#endif
+ atomic_set(&mm->context.active_cpus, 0);
+
return 0;
}
@@ -226,12 +216,6 @@ void destroy_context(struct mm_struct *mm)
#ifdef CONFIG_SPAPR_TCE_IOMMU
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
#endif
-#ifdef CONFIG_PPC_ICSWX
- drop_cop(mm->context.acop, mm);
- kfree(mm->context.cop_lockp);
- mm->context.cop_lockp = NULL;
-#endif /* CONFIG_PPC_ICSWX */
-
if (radix_enabled()) {
/*
* Radix doesn't have a valid bit in the process table
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d46128b22150..57fbc554c785 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -27,7 +27,7 @@
/*
* On 40x and 8xx, we directly inline tlbia and tlbivax
*/
-#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx)
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
@@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid)
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
-#else /* CONFIG_40x || CONFIG_8xx */
+#else /* CONFIG_40x || CONFIG_PPC_8xx */
extern void _tlbil_all(void);
extern void _tlbil_pid(unsigned int pid);
#ifdef CONFIG_PPC_BOOK3E
@@ -46,12 +46,12 @@ extern void _tlbil_pid_noind(unsigned int pid);
#else
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
#endif
-#endif /* !(CONFIG_40x || CONFIG_8xx) */
+#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */
/*
* On 8xx, we directly inline tlbie, on others, it's extern
*/
-#ifdef CONFIG_8xx
+#ifdef CONFIG_PPC_8xx
static inline void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
@@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
{
__tlbil_va(address, pid);
}
-#endif /* CONFIG_8xx */
+#endif /* CONFIG_PPC_8xx */
#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 31eed8fa8e99..3b65917785a5 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
+#include <misc/cxl-base.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
@@ -64,6 +65,27 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+ smp_mb();
+ smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
/*
* We use this to invalidate a pmdp entry before switching from a
* hugepte to regular pmd entry.
@@ -77,7 +99,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
* This ensures that generic code that rely on IRQ disabling
* to prevent a parallel THP split work as expected.
*/
- kick_all_cpus_sync();
+ serialize_against_pte_lookup(vma->vm_mm);
}
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 443a2c66a304..ec277913e01b 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
* by sending an IPI to all the cpus and executing a dummy
* function there.
*/
- kick_all_cpus_sync();
+ serialize_against_pte_lookup(vma->vm_mm);
/*
* Now invalidate the hpte entries in the range
* covered by pmd. This make sure we take a
@@ -329,7 +329,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
unsigned int psize;
unsigned long vsid;
unsigned long flags = 0;
- const struct cpumask *tmp;
/* get the base page size,vsid and segment size */
#ifdef CONFIG_DEBUG_VM
@@ -350,8 +349,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
ssize = mmu_kernel_ssize;
}
- tmp = cpumask_of(smp_processor_id());
- if (cpumask_equal(mm_cpumask(mm), tmp))
+ if (mm_is_thread_local(mm))
flags |= HPTE_LOCAL_UPDATE;
return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
@@ -380,16 +378,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
/*
- * Serialize against find_linux_pte_or_hugepte which does lock-less
+ * Serialize against find_current_mm_pte variants which does lock-less
* lookup in page tables with local interrupts disabled. For huge pages
* it casts pmd_t to pte_t. Since format of pte_t is different from
* pmd_t we want to prevent transit from pmd pointing to page table
* to pmd pointing to huge page (and back) while interrupts are disabled.
* We clear pmd to possibly replace it with page table pointer in
* different code paths. So make sure we wait for the parallel
- * find_linux_pte_or_hugepage to finish.
+ * find_curren_mm_pte to finish.
*/
- kick_all_cpus_sync();
+ serialize_against_pte_lookup(mm);
return old_pmd;
}
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 671a45d86c18..39c252b54d16 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -8,10 +8,15 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
+#include <linux/string_helpers.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -31,9 +36,13 @@ unsigned int mmu_base_pid;
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
unsigned long table_size)
{
- unsigned long patb1 = base | table_size | PATB_GR;
+ unsigned long patb0, patb1;
+
+ patb0 = be64_to_cpu(partition_tb[0].patb0);
+ patb1 = base | table_size | PATB_GR;
+
+ mmu_partition_table_set_entry(0, patb0, patb1);
- partition_tb->patb1 = cpu_to_be64(patb1);
return 0;
}
@@ -179,10 +188,14 @@ static inline void __meminit print_mapping(unsigned long start,
unsigned long end,
unsigned long size)
{
+ char buf[10];
+
if (end <= start)
return;
- pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size);
+ string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
+
+ pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
}
static int __meminit create_physical_mapping(unsigned long start,
@@ -526,6 +539,7 @@ void __init radix__early_init_mmu(void)
__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
__vmalloc_start = RADIX_VMALLOC_START;
__vmalloc_end = RADIX_VMALLOC_END;
+ __kernel_io_start = RADIX_KERN_IO_START;
vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
ioremap_bot = IOREMAP_BASE;
@@ -836,9 +850,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
*/
pmd = *pmdp;
pmd_clear(pmdp);
+
/*FIXME!! Verify whether we need this kick below */
- kick_all_cpus_sync();
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ serialize_against_pte_lookup(vma->vm_mm);
+
+ radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
return pmd;
}
@@ -897,16 +914,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
/*
- * Serialize against find_linux_pte_or_hugepte which does lock-less
+ * Serialize against find_current_mm_pte which does lock-less
* lookup in page tables with local interrupts disabled. For huge pages
* it casts pmd_t to pte_t. Since format of pte_t is different from
* pmd_t we want to prevent transit from pmd pointing to page table
* to pmd pointing to huge page (and back) while interrupts are disabled.
* We clear pmd to possibly replace it with page table pointer in
* different code paths. So make sure we wait for the parallel
- * find_linux_pte_or_hugepage to finish.
+ * find_current_mm_pte to finish.
*/
- kick_all_cpus_sync();
+ serialize_against_pte_lookup(mm);
return old_pmd;
}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index a9e4bfc025bc..65eda1997c3f 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -34,6 +34,7 @@
#include <asm/fixmap.h>
#include <asm/io.h>
#include <asm/setup.h>
+#include <asm/sections.h>
#include "mmu_decl.h"
@@ -242,7 +243,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
/*
* Map in a chunk of physical memory starting at start.
*/
-void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
+static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
unsigned long v, s, f;
phys_addr_t p;
@@ -294,7 +295,7 @@ void __init mapin_ram(void)
* Returns true (1) if PTE was found, zero otherwise. The pointer to
* the PTE pointer is unmodified if PTE is not found.
*/
-int
+static int
get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
{
pgd_t *pgd;
@@ -323,9 +324,7 @@ get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
return(retval);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-
-static int __change_page_attr(struct page *page, pgprot_t prot)
+static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
{
pte_t *kpte;
pmd_t *kpmd;
@@ -339,8 +338,6 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
return -EINVAL;
__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
- wmb();
- flush_tlb_page(NULL, address);
pte_unmap(kpte);
return 0;
@@ -349,44 +346,65 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
/*
* Change the page attributes of an page in the linear mapping.
*
- * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY
+ * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY
*/
static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int i, err = 0;
unsigned long flags;
+ struct page *start = page;
local_irq_save(flags);
for (i = 0; i < numpages; i++, page++) {
- err = __change_page_attr(page, prot);
+ err = __change_page_attr_noflush(page, prot);
if (err)
break;
}
+ wmb();
+ flush_tlb_kernel_range((unsigned long)page_address(start),
+ (unsigned long)page_address(page));
local_irq_restore(flags);
return err;
}
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void mark_initmem_nx(void)
{
- if (PageHighMem(page))
- return;
+ struct page *page = virt_to_page(_sinittext);
+ unsigned long numpages = PFN_UP((unsigned long)_einittext) -
+ PFN_DOWN((unsigned long)_sinittext);
- change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+ change_page_attr(page, numpages, PAGE_KERNEL);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-static int fixmaps;
-
-void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mark_rodata_ro(void)
{
- unsigned long address = __fix_to_virt(idx);
+ struct page *page;
+ unsigned long numpages;
+
+ page = virt_to_page(_stext);
+ numpages = PFN_UP((unsigned long)_etext) -
+ PFN_DOWN((unsigned long)_stext);
- if (idx >= __end_of_fixed_addresses) {
- BUG();
+ change_page_attr(page, numpages, PAGE_KERNEL_ROX);
+ /*
+ * mark .rodata as read only. Use __init_begin rather than __end_rodata
+ * to cover NOTES and EXCEPTION_TABLE.
+ */
+ page = virt_to_page(__start_rodata);
+ numpages = PFN_UP((unsigned long)__init_begin) -
+ PFN_DOWN((unsigned long)__start_rodata);
+
+ change_page_attr(page, numpages, PAGE_KERNEL_RO);
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (PageHighMem(page))
return;
- }
- map_kernel_page(address, phys, pgprot_val(flags));
- fixmaps++;
+ change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 0736e94c7615..ac0717a90ca6 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -104,6 +104,8 @@ unsigned long __vmalloc_start;
EXPORT_SYMBOL(__vmalloc_start);
unsigned long __vmalloc_end;
EXPORT_SYMBOL(__vmalloc_end);
+unsigned long __kernel_io_start;
+EXPORT_SYMBOL(__kernel_io_start);
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
unsigned long __pte_frag_nr;
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index bde378559d01..906a86fe457b 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -121,12 +121,25 @@ slb_miss_kernel_load_vmemmap:
1:
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
- /* vmalloc mapping gets the encoding from the PACA as the mapping
- * can be demoted from 64K -> 4K dynamically on some machines
+ /*
+ * r10 contains the ESID, which is the original faulting EA shifted
+ * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28)
+ * which is 0xd00038000. That can't be used as an immediate, even if we
+ * ignored the 0xd, so we have to load it into a register, and we only
+ * have one register free. So we must load all of (H_VMALLOC_END >> 28)
+ * into a register and compare ESID against that.
+ */
+ lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000
+ ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800
+ // Rotate left 4, then mask with 0xffffffff0
+ rldic r11,r11,4,28 // r11 = 0xd00038000
+ cmpld r10,r11 // if r10 >= r11
+ bge 5f // goto io_mapping
+
+ /*
+ * vmalloc mapping gets the encoding from the PACA as the mapping
+ * can be demoted from 64K -> 4K dynamically on some machines.
*/
- clrldi r11,r10,48
- cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1
- bgt 5f
lhz r11,PACAVMALLOCSLLP(r13)
b 6f
5:
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 16ae1bbe13f0..b3e849c4886e 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -54,23 +54,15 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
*/
__tlbiel_pid(pid, 0, ric);
- if (ric == RIC_FLUSH_ALL)
- /* For the remaining sets, just flush the TLB */
- ric = RIC_FLUSH_TLB;
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+ /* For the remaining sets, just flush the TLB */
for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_pid(pid, set, ric);
-
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void tlbiel_pwc(unsigned long pid)
-{
- asm volatile("ptesync": : :"memory");
-
- /* For PWC flush, we don't look at set number */
- __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
asm volatile("ptesync": : :"memory");
asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
@@ -146,31 +138,23 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
preempt_disable();
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
-void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+#ifndef CONFIG_SMP
+static void radix__local_flush_all_mm(struct mm_struct *mm)
{
unsigned long pid;
- struct mm_struct *mm = tlb->mm;
- /*
- * If we are doing a full mm flush, we will do a tlb flush
- * with RIC_FLUSH_ALL later.
- */
- if (tlb->fullmm)
- return;
preempt_disable();
-
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
- tlbiel_pwc(pid);
-
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
preempt_enable();
}
-EXPORT_SYMBOL(radix__local_flush_tlb_pwc);
+#endif /* CONFIG_SMP */
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
@@ -208,38 +192,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
goto no_context;
if (!mm_is_thread_local(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
else
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
no_context:
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+static void radix__flush_all_mm(struct mm_struct *mm)
{
unsigned long pid;
- struct mm_struct *mm = tlb->mm;
- /*
- * If we are doing a full mm flush, we will do a tlb flush
- * with RIC_FLUSH_ALL later.
- */
- if (tlb->fullmm)
- return;
preempt_disable();
-
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
if (!mm_is_thread_local(mm))
- _tlbie_pid(pid, RIC_FLUSH_PWC);
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
else
- tlbiel_pwc(pid);
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
no_context:
preempt_enable();
}
+
+void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+{
+ tlb->need_flush_all = 1;
+}
EXPORT_SYMBOL(radix__flush_tlb_pwc);
void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
@@ -271,6 +252,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
}
EXPORT_SYMBOL(radix__flush_tlb_page);
+#else /* CONFIG_SMP */
+#define radix__flush_all_mm radix__local_flush_all_mm
#endif /* CONFIG_SMP */
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -288,6 +271,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
{
struct mm_struct *mm = vma->vm_mm;
+
radix__flush_tlb_mm(mm);
}
EXPORT_SYMBOL(radix__flush_tlb_range);
@@ -319,7 +303,10 @@ void radix__tlb_flush(struct mmu_gather *tlb)
*/
if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
- else
+ else if (tlb->need_flush_all) {
+ tlb->need_flush_all = 0;
+ radix__flush_all_mm(mm);
+ } else
radix__flush_tlb_mm(mm);
}
@@ -364,6 +351,43 @@ err_out:
preempt_enable();
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ int local = mm_is_thread_local(mm);
+ unsigned long ap = mmu_get_ap(mmu_virtual_psize);
+ unsigned long pid, end;
+
+
+ pid = mm ? mm->context.id : 0;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto no_context;
+
+ /* 4k page size, just blow the world */
+ if (PAGE_SIZE == 0x1000) {
+ radix__flush_all_mm(mm);
+ return;
+ }
+
+ /* Otherwise first do the PWC */
+ if (local)
+ _tlbiel_pid(pid, RIC_FLUSH_PWC);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_PWC);
+
+ /* Then iterate the pages */
+ end = addr + HPAGE_PMD_SIZE;
+ for (; addr < end; addr += PAGE_SIZE) {
+ if (local)
+ _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+ else
+ _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+ }
+no_context:
+ preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
unsigned long page_size)
{
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index b5b0fb97b9c0..881ebd53ffc2 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -29,6 +29,8 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
+#include <asm/pte-walk.h>
+
#include <trace/events/thp.h>
@@ -138,13 +140,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
{
- const struct cpumask *tmp;
- int i, local = 0;
+ int i, local;
i = batch->index;
- tmp = cpumask_of(smp_processor_id());
- if (cpumask_equal(mm_cpumask(batch->mm), tmp))
- local = 1;
+ local = mm_is_thread_local(batch->mm);
if (i == 1)
flush_hash_page(batch->vpn[0], batch->pte[0],
batch->psize, batch->ssize, local);
@@ -207,8 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp,
- &hugepage_shift);
+ pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+ &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index eabecfcaef7c..048b8e9f4492 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -60,7 +60,7 @@ _GLOBAL(__tlbil_va)
isync
1: blr
-#elif defined(CONFIG_8xx)
+#elif defined(CONFIG_PPC_8xx)
/*
* Nothing to do for 8xx, everything is inline