Diffstat (limited to 'arch/powerpc/mm')
26 files changed, 670 insertions, 1041 deletions
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index f4c6472f2fc4..f29212e40f40 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -22,8 +22,11 @@ extern int __map_without_ltlbs; +static unsigned long block_mapped_ram; + /* - * Return PA for this VA if it is in IMMR area, or 0 + * Return PA for this VA if it is in an area mapped with LTLBs. + * Otherwise, returns 0 */ phys_addr_t v_block_mapped(unsigned long va) { @@ -33,11 +36,13 @@ phys_addr_t v_block_mapped(unsigned long va) return 0; if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); return 0; } /* - * Return VA for a given PA or 0 if not mapped + * Return VA for a given PA mapped with LTLBs or 0 if not mapped */ unsigned long p_block_mapped(phys_addr_t pa) { @@ -47,6 +52,8 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); return 0; } @@ -58,7 +65,7 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR @@ -80,7 +87,7 @@ void __init MMU_init_hw(void) #endif } -static void mmu_mapin_immr(void) +static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; @@ -96,8 +103,11 @@ static void mmu_mapin_immr(void) extern unsigned int DTLBMiss_jmp; #endif extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; +#ifndef CONFIG_PIN_TLB_TEXT +extern unsigned int ITLBMiss_cmp; +#endif -void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) { unsigned int instr = *addr; @@ -116,6 +126,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top) #ifndef CONFIG_PIN_TLB_IMMR patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); #endif +#ifndef CONFIG_PIN_TLB_TEXT + mmu_patch_cmp_limit(&ITLBMiss_cmp, 0); +#endif } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); } @@ -133,11 +146,13 @@ unsigned long __init mmu_mapin_ram(unsigned long top) if (mapped) memblock_set_current_limit(mapped); + block_mapped_ram = mapped; + return mapped; } -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 7414034df1c3..fb844d2f266e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,7 +8,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - init-common.o + init-common.o mmu_context.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o @@ -22,8 +22,6 @@ ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif -obj-$(CONFIG_PPC_ICSWX) += icswx.o -obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o obj-$(CONFIG_40x) += 
40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index b1c144b03fcf..5c4c93dcff19 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -205,7 +205,7 @@ static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, aps_index = calculate_pagesize(st, aps, "actual"); if (aps_index != 2) seq_printf(st->seq, "LP enc: %lx", lp); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 44fe4833910f..c9282d27b203 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -350,7 +350,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + seq_putc(st->seq, '\n'); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4c422632047b..4797d08581ce 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -45,43 +45,39 @@ #include <asm/siginfo.h> #include <asm/debug.h> -#include "icswx.h" - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) +static inline bool notify_page_fault(struct pt_regs *regs) { - int ret = 0; + bool ret = false; +#ifdef CONFIG_KPROBES /* kprobe_running() needs smp_processor_id() */ if (!user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = 1; + ret = true; preempt_enable(); } +#endif /* CONFIG_KPROBES */ + + if (unlikely(debugger_fault_handler(regs))) + ret = true; return ret; } -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif /* * Check whether the instruction at regs->nip is a store using * an update addressing form which will update r1. 
*/ -static int store_updates_sp(struct pt_regs *regs) +static bool store_updates_sp(struct pt_regs *regs) { unsigned int inst; if (get_user(inst, (unsigned int __user *)regs->nip)) - return 0; + return false; /* check for 1 in the rA field */ if (((inst >> 16) & 0x1f) != 1) - return 0; + return false; /* check major opcode */ switch (inst >> 26) { case 37: /* stwu */ @@ -89,7 +85,7 @@ static int store_updates_sp(struct pt_regs *regs) case 45: /* sthu */ case 53: /* stfsu */ case 55: /* stfdu */ - return 1; + return true; case 62: /* std or stdu */ return (inst & 3) == 1; case 31: @@ -101,18 +97,53 @@ static int store_updates_sp(struct pt_regs *regs) case 439: /* sthux */ case 695: /* stfsux */ case 759: /* stfdux */ - return 1; + return true; } } - return 0; + return false; } /* * do_page_fault error handling helpers */ -#define MM_FAULT_RETURN 0 -#define MM_FAULT_CONTINUE -1 -#define MM_FAULT_ERR(sig) (sig) +static int +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) +{ + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception(SIGSEGV, regs, si_code, address); + + return 0; +} + +static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) +{ + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); +} + +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + return __bad_area_nosemaphore(regs, address, si_code); +} + +static noinline int bad_area(struct pt_regs *regs, unsigned long address) +{ + return __bad_area(regs, address, SEGV_MAPERR); +} static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) @@ -121,7 +152,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int lsb = 0; if (!user_mode(regs)) - return MM_FAULT_ERR(SIGBUS); + return SIGBUS; current->thread.trap_nr = BUS_ADRERR; info.si_signo = SIGBUS; @@ -142,25 +173,17 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, #endif info.si_addr_lsb = lsb; force_sig_info(SIGBUS, &info, current); - return MM_FAULT_RETURN; + return 0; } static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) { /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue the pagefault. + * Kernel page fault interrupted by SIGKILL. We have no reason to + * continue processing. */ - if (fatal_signal_pending(current)) { - /* Coming from kernel, we need to deal with uaccess fixups */ - if (user_mode(regs)) - return MM_FAULT_RETURN; - return MM_FAULT_ERR(SIGKILL); - } - - /* No fault: be happy */ - if (!(fault & VM_FAULT_ERROR)) - return MM_FAULT_CONTINUE; + if (fatal_signal_pending(current) && !user_mode(regs)) + return SIGKILL; /* Out of memory */ if (fault & VM_FAULT_OOM) { @@ -169,19 +192,176 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * made us unable to handle the page fault gracefully. 
*/ if (!user_mode(regs)) - return MM_FAULT_ERR(SIGKILL); + return SIGSEGV; pagefault_out_of_memory(); - return MM_FAULT_RETURN; + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); + else if (fault & VM_FAULT_SIGSEGV) + return bad_area_nosemaphore(regs, addr); + else + BUG(); + } + return 0; +} + +/* Is this a bad kernel fault ? */ +static bool bad_kernel_fault(bool is_exec, unsigned long error_code, + unsigned long address) +{ + if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) { + printk_ratelimited(KERN_CRIT "kernel tried to execute" + " exec-protected page (%lx) -" + "exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, + current_uid())); } + return is_exec || (address >= TASK_SIZE); +} - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) - return do_sigbus(regs, addr, fault); +static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma, + bool store_update_sp) +{ + /* + * N.B. The POWER/Open ABI allows programs to access up to + * 288 bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + return true; - /* We don't understand the fault code, this is fatal */ - BUG(); - return MM_FAULT_CONTINUE; + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. + */ + if (address + 2048 < uregs->gpr[1] && !store_update_sp) + return true; + } + return false; +} + +static bool access_error(bool is_write, bool is_exec, + struct vm_area_struct *vma) +{ + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. 
+ */ + if (is_exec) { + return !(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE))); + } + + if (is_write) { + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return true; + return false; + } + + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return true; + + return false; } +#ifdef CONFIG_PPC_SMLPAR +static inline void cmo_account_page_fault(void) +{ + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +} +#else +static inline void cmo_account_page_fault(void) { } +#endif /* CONFIG_PPC_SMLPAR */ + +#ifdef CONFIG_PPC_STD_MMU +static void sanity_check_fault(bool is_write, unsigned long error_code) +{ + /* + * For hash translation mode, we should never get a + * PROTFAULT. Any update to pte to reduce access will result in us + * removing the hash page table entry, thus resulting in a DSISR_NOHPTE + * fault instead of DSISR_PROTFAULT. + * + * A pte update to relax the access will not result in a hash page table + * entry invalidate and hence can result in DSISR_PROTFAULT. + * ptep_set_access_flags() doesn't do a hpte flush. This is why we have + * the special !is_write in the below conditional. + * + * For platforms that doesn't supports coherent icache and do support + * per page noexec bit, we do setup things such that we do the + * sync between D/I cache via fault. But that is handled via low level + * hash fault code (hash_page_do_lazy_icache()) and we should not reach + * here in such case. + * + * For wrong access that can result in PROTFAULT, the above vma->vm_flags + * check should handle those and hence we should fall to the bad_area + * handling correctly. + * + * For embedded with per page exec support that doesn't support coherent + * icache we do get PROTFAULT and we handle that D/I cache sync in + * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON + * is conditional for server MMU. + * + * For radix, we can get prot fault for autonuma case, because radix + * page table will have them marked noaccess for user. + */ + if (!radix_enabled() && !is_write) + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +} +#else +static void sanity_check_fault(bool is_write, unsigned long error_code) { } +#endif /* CONFIG_PPC_STD_MMU */ + +/* + * Define the correct "is_write" bit in error_code based + * on the processor family + */ +#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#define page_fault_is_write(__err) ((__err) & ESR_DST) +#define page_fault_is_bad(__err) (0) +#else +#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) +#if defined(CONFIG_PPC_8xx) +#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) +#elif defined(CONFIG_PPC64) +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) +#else +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) +#endif +#endif + /* * For 600- and 800-family processors, the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. For 400-family processors @@ -195,92 +375,56 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +static int __do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) { - enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - int code = SEGV_MAPERR; - int is_write = 0; - int trap = TRAP(regs); - int is_exec = trap == 0x400; + int is_exec = TRAP(regs) == 0x400; int is_user = user_mode(regs); - int fault; - int rc = 0, store_update_sp = 0; + int is_write = page_fault_is_write(error_code); + int fault, major = 0; + bool store_update_sp = false; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - /* - * Fortunately the bit assignments in SRR1 for an instruction - * fault and DSISR for a data fault are mostly the same for the - * bits we are interested in. But there are some bits which - * indicate errors in DSISR but can validly be set in SRR1. - */ - if (is_exec) - error_code &= 0x48200000; - else - is_write = error_code & DSISR_ISSTORE; -#else - is_write = error_code & ESR_DST; -#endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (notify_page_fault(regs)) + return 0; -#ifdef CONFIG_PPC_ICSWX - /* - * we need to do this early because this "data storage - * interrupt" does not update the DAR/DEAR so we don't want to - * look at it - */ - if (error_code & ICSWX_DSI_UCT) { - rc = acop_handle_fault(regs, address, error_code); - if (rc) - goto bail; + if (unlikely(page_fault_is_bad(error_code))) { + if (is_user) { + _exception(SIGBUS, regs, BUS_OBJERR, address); + return 0; + } + return SIGBUS; } -#endif /* CONFIG_PPC_ICSWX */ - - if (notify_page_fault(regs)) - goto bail; - if (unlikely(debugger_fault_handler(regs))) - goto bail; + /* Additional sanity check(s) */ + sanity_check_fault(is_write, error_code); /* * The kernel should never take an execute fault nor should it * take a page fault to a kernel address. */ - if (!is_user && (is_exec || (address >= TASK_SIZE))) { - rc = SIGSEGV; - goto bail; - } + if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + return SIGSEGV; -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx)) - if (error_code & DSISR_DABRMATCH) { - /* breakpoint match */ - do_break(regs, address, error_code); - goto bail; + /* + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault + */ + if (unlikely(faulthandler_disabled() || !mm)) { + if (is_user) + printk_ratelimited(KERN_ERR "Page fault in user mode" + " with faulthandler_disabled()=%d" + " mm=%p\n", + faulthandler_disabled(), mm); + return bad_area_nosemaphore(regs, address); } -#endif /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - if (faulthandler_disabled() || mm == NULL) { - if (!is_user) { - rc = SIGSEGV; - goto bail; - } - /* faulthandler_disabled() in user mode is really bad, - as is current->mm == NULL. 
*/ - printk(KERN_EMERG "Page fault in user mode with " - "faulthandler_disabled() = %d mm = %p\n", - faulthandler_disabled(), mm); - printk(KERN_EMERG "NIP = %lx MSR = %lx\n", - regs->nip, regs->msr); - die("Weird page fault", regs, SIGSEGV); - } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -293,6 +437,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (is_user) flags |= FAULT_FLAG_USER; + if (is_write) + flags |= FAULT_FLAG_WRITE; + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -309,9 +457,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if (!is_user && !search_exception_tables(regs->nip)) - goto bad_area_nosemaphore; + return bad_area_nosemaphore(regs, address); retry: down_read(&mm->mmap_sem); @@ -325,122 +473,24 @@ retry: } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) + return bad_area(regs, address); + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return bad_area(regs, address); - /* - * N.B. The POWER/Open ABI allows programs to access up to - * 288 bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; + /* The stack is being expanded, check if it's valid */ + if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp))) + return bad_area(regs, address); - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. - */ - if (address + 2048 < uregs->gpr[1] && !store_update_sp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; + /* Try to expand it */ + if (unlikely(expand_stack(vma, address))) + return bad_area(regs, address); good_area: - code = SEGV_ACCERR; -#if defined(CONFIG_6xx) - if (error_code & 0x95700000) - /* an error such as lwarx to I/O controller space, - address matching DABR, eciwx, etc. */ - goto bad_area; -#endif /* CONFIG_6xx */ -#if defined(CONFIG_8xx) - /* The MPC8xx seems to always set 0x80000000, which is - * "undefined". Of those that can be set, this is the only - * one which seems bad. - */ - if (error_code & 0x10000000) - /* Guarded storage error. 
*/ - goto bad_area; -#endif /* CONFIG_8xx */ - - if (is_exec) { - /* - * Allow execution from readable areas if the MMU does not - * provide separate controls over reading and executing. - * - * Note: That code used to not be enabled for 4xx/BookE. - * It is now as I/D cache coherency for these is done at - * set_pte_at() time and I see no reason why the test - * below wouldn't be valid on those processors. This -may- - * break programs compiled with a really old ABI though. - */ - if (!(vma->vm_flags & VM_EXEC) && - (cpu_has_feature(CPU_FTR_NOEXECUTE) || - !(vma->vm_flags & (VM_READ | VM_WRITE)))) - goto bad_area; - /* a write */ - } else if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - flags |= FAULT_FLAG_WRITE; - /* a read */ - } else { - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } -#ifdef CONFIG_PPC_STD_MMU - /* - * For hash translation mode, we should never get a - * PROTFAULT. Any update to pte to reduce access will result in us - * removing the hash page table entry, thus resulting in a DSISR_NOHPTE - * fault instead of DSISR_PROTFAULT. - * - * A pte update to relax the access will not result in a hash page table - * entry invalidate and hence can result in DSISR_PROTFAULT. - * ptep_set_access_flags() doesn't do a hpte flush. This is why we have - * the special !is_write in the below conditional. - * - * For platforms that doesn't supports coherent icache and do support - * per page noexec bit, we do setup things such that we do the - * sync between D/I cache via fault. But that is handled via low level - * hash fault code (hash_page_do_lazy_icache()) and we should not reach - * here in such case. - * - * For wrong access that can result in PROTFAULT, the above vma->vm_flags - * check should handle those and hence we should fall to the bad_area - * handling correctly. - * - * For embedded with per page exec support that doesn't support coherent - * icache we do get PROTFAULT and we handle that D/I cache sync in - * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON - * is conditional for server MMU. - * - * For radix, we can get prot fault for autonuma case, because radix - * page table will have them marked noaccess for user. - */ - if (!radix_enabled() && !is_write) - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); -#endif /* CONFIG_PPC_STD_MMU */ + if (unlikely(access_error(is_write, is_exec, vma))) + return bad_area(regs, address); /* * If for any reason at all we couldn't handle the fault, @@ -448,6 +498,7 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* * Handle the retry right now, the mmap_sem has been released in that @@ -465,64 +516,39 @@ good_area: if (!fatal_signal_pending(current)) goto retry; } - /* We will enter mm_fault_error() below */ - } else - up_read(¤t->mm->mmap_sem); - - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (fault & VM_FAULT_SIGSEGV) - goto bad_area_nosemaphore; - rc = mm_fault_error(regs, address, fault); - if (rc >= MM_FAULT_RETURN) - goto bail; - else - rc = 0; + + /* + * User mode? Just return to handle the fatal exception otherwise + * return to bad_page_fault + */ + return is_user ? 0 : SIGBUS; } + up_read(¤t->mm->mmap_sem); + + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + /* * Major/minor page fault accounting. 
*/ - if (fault & VM_FAULT_MAJOR) { + if (major) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); -#ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - u32 page_ins; - - preempt_disable(); - page_ins = be32_to_cpu(get_lppaca()->page_ins); - page_ins += 1 << PAGE_FACTOR; - get_lppaca()->page_ins = cpu_to_be32(page_ins); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + cmo_account_page_fault(); } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - - goto bail; - -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (is_user) { - _exception(SIGSEGV, regs, code, address); - goto bail; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } + return 0; +} +NOKPROBE_SYMBOL(__do_page_fault); - if (is_exec && (error_code & DSISR_PROTFAULT)) - printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, current_uid())); - - rc = SIGSEGV; - -bail: +int do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + enum ctx_state prev_state = exception_enter(); + int rc = __do_page_fault(regs, address, error_code); exception_exit(prev_state); return rc; } diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 6f962e5cb5e1..ffbd7c0bda96 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -575,7 +575,6 @@ _GLOBAL(flush_hash_pages) rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ stwcx. r8,0,r5 /* update the pte */ bne- 33b -EXPORT_SYMBOL(flush_hash_pages) /* Get the address of the primary PTE group in the hash table (r3) */ _GLOBAL(flush_hash_patch_A) @@ -634,6 +633,7 @@ _GLOBAL(flush_hash_patch_B) SYNC_601 isync blr +EXPORT_SYMBOL(flush_hash_pages) /* * Flush an entry from the TLB diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a20669c19e7..67ec2e927253 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -61,6 +61,7 @@ #include <asm/tm.h> #include <asm/trace.h> #include <asm/ps3.h> +#include <asm/pte-walk.h> #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -507,9 +508,9 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, printk(KERN_INFO "Huge page(16GB) memory: " "addr = 0x%lX size = 0x%lX pages = %d\n", phys_addr, block_size, expected_pages); - if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { memblock_reserve(phys_addr, block_size * expected_pages); - add_gpage(phys_addr, block_size, expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); } return 0; } @@ -1019,6 +1020,7 @@ void __init hash__early_init_mmu(void) __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -1228,7 +1230,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long vsid; pte_t *ptep; unsigned hugeshift; - const struct cpumask *tmp; int rc, user_region = 0; int psize, ssize; @@ -1280,8 +1281,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } /* Check CPU locality */ - tmp = cpumask_of(smp_processor_id()); - if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) + if (user_region && mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; #ifndef CONFIG_PPC_64K_PAGES @@ -1297,7 +1297,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1526,7 +1526,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); if (!ptep) goto out_exit; @@ -1543,7 +1543,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Is that local to this CPU ? */ - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + if (mm_is_thread_local(mm)) update_flags |= HPTE_LOCAL_UPDATE; /* Hash it in */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e1bf5ca397fe..1571a498a33f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,6 +24,8 @@ #include <asm/tlb.h> #include <asm/setup.h> #include <asm/hugetlb.h> +#include <asm/pte-walk.h> + #ifdef CONFIG_HUGETLB_PAGE @@ -36,32 +38,15 @@ unsigned int HPAGE_SHIFT; EXPORT_SYMBOL(HPAGE_SHIFT); -/* - * Tracks gpages after the device tree is scanned and before the - * huge_boot_pages list is ready. On non-Freescale implementations, this is - * just used to track 16G pages and so is a single array. 
FSL-based - * implementations may have more than one gpage size, so we need multiple - * arrays - */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -#define MAX_NUMBER_GPAGES 128 -struct psize_gpages { - u64 gpage_list[MAX_NUMBER_GPAGES]; - unsigned int nr_gpages; -}; -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; -#else -#define MAX_NUMBER_GPAGES 1024 -static u64 gpage_freearray[MAX_NUMBER_GPAGES]; -static unsigned nr_gpages; -#endif - #define hugepd_none(hpd) (hpd_val(hpd) == 0) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - /* Only called for hugetlbfs pages, hence can ignore THP */ - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); + /* + * Only called for hugetlbfs pages, hence can ignore THP and the + * irq disabled walk. + */ + return __find_linux_pte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -210,145 +195,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz return hugepte_offset(*hpdp, addr, pdshift); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy allocator is setup. - */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) -{ - unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); - int i; - - if (addr == 0) - return; - - gpage_freearray[idx].nr_gpages = number_of_pages; - - for (i = 0; i < number_of_pages; i++) { - gpage_freearray[idx].gpage_list[i] = addr; - addr += page_size; - } -} - -/* - * Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) -{ - struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(huge_page_shift(hstate)); - int nr_gpages = gpage_freearray[idx].nr_gpages; - - if (nr_gpages == 0) - return 0; - -#ifdef CONFIG_HIGHMEM - /* - * If gpages can be in highmem we can't use the trick of storing the - * data structure in the page; allocate space for this - */ - m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); - m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; -#else - m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); -#endif - - list_add(&m->list, &huge_boot_pages); - gpage_freearray[idx].nr_gpages = nr_gpages; - gpage_freearray[idx].gpage_list[nr_gpages] = 0; - m->hstate = hstate; - - return 1; -} +#ifdef CONFIG_PPC_BOOK3S_64 /* - * Scan the command line hugepagesz= options for gigantic pages; store those in - * a list that we use to allocate the memory once all options are parsed. + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready on pseries. */ - -unsigned long gpage_npages[MMU_PAGE_COUNT]; - -static int __init do_gpage_early_setup(char *param, char *val, - const char *unused, void *arg) -{ - static phys_addr_t size; - unsigned long npages; - - /* - * The hugepagesz and hugepages cmdline options are interleaved. We - * use the size variable to keep track of whether or not this was done - * properly and skip over instances where it is incorrect. Other - * command-line parsing code will issue warnings, so we don't need to. 
- * - */ - if ((strcmp(param, "default_hugepagesz") == 0) || - (strcmp(param, "hugepagesz") == 0)) { - size = memparse(val, NULL); - } else if (strcmp(param, "hugepages") == 0) { - if (size != 0) { - if (sscanf(val, "%lu", &npages) <= 0) - npages = 0; - if (npages > MAX_NUMBER_GPAGES) { - pr_warn("MMU: %lu pages requested for page " -#ifdef CONFIG_PHYS_ADDR_T_64BIT - "size %llu KB, limiting to " -#else - "size %u KB, limiting to " -#endif - __stringify(MAX_NUMBER_GPAGES) "\n", - npages, size / 1024); - npages = MAX_NUMBER_GPAGES; - } - gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; - size = 0; - } - } - return 0; -} - +#define MAX_NUMBER_GPAGES 1024 +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +__initdata static unsigned nr_gpages; /* - * This function allocates physical space for pages that are larger than the - * buddy allocator can handle. We want to allocate these in highmem because - * the amount of lowmem is limited. This means that this function MUST be - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb - * allocate to grab highmem. - */ -void __init reserve_hugetlb_gpages(void) -{ - static __initdata char cmdline[COMMAND_LINE_SIZE]; - phys_addr_t size, base; - int i; - - strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - NULL, &do_gpage_early_setup); - - /* - * Walk gpage list in reverse, allocating larger page sizes first. - * Skip over unsupported sizes, or sizes that have 0 gpages allocated. - * When we reach the point in the list where pages are no longer - * considered gpages, we're done. - */ - for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { - if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) - continue; - else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) - break; - - size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); - base = memblock_alloc_base(size * gpage_npages[i], size, - MEMBLOCK_ALLOC_ANYWHERE); - add_gpage(base, size, gpage_npages[i]); - } -} - -#else /* !PPC_FSL_BOOK3E */ - -/* Build list of addresses of gigantic pages. This function is used in early + * Build list of addresses of gigantic pages. This function is used in early * boot before the buddy allocator is setup. */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -360,10 +220,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) } } -/* Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. 
- */ -int alloc_bootmem_huge_page(struct hstate *hstate) +int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; if (nr_gpages == 0) @@ -376,6 +233,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif + +int __init alloc_bootmem_huge_page(struct hstate *h) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); +#endif + return __alloc_bootmem_huge_page(h); +} + #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -407,8 +275,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte) batchp = &get_cpu_var(hugepd_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), - cpumask_of(smp_processor_id()))) { + mm_is_thread_local(tlb->mm)) { kmem_cache_free(hugepte_cache, hugepte); put_cpu_var(hugepd_freelist_cur); return; @@ -886,9 +753,8 @@ void flush_dcache_icache_hugepage(struct page *page) * This function need to be called with interrupts disabled. We use this variant * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ - -pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *shift) +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -897,8 +763,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; - if (shift) - *shift = 0; + if (hpage_shift) + *hpage_shift = 0; if (is_thp) *is_thp = false; @@ -968,16 +834,15 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, ret_pte = hugepte_offset(*hpdp, ea, pdshift); pdshift = hugepd_shift(*hpdp); out: - if (shift) - *shift = pdshift; + if (hpage_shift) + *hpage_shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; unsigned long pte_end; struct page *head, *page; pte_t pte; @@ -988,18 +853,10 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_READ; - /* - * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined - * as 0 and _PAGE_RO has to be set when a page is not writable - */ - if (write) - mask |= _PAGE_WRITE; - else - mask |= _PAGE_RO; - - if ((pte_val(pte) & mask) != mask) + if (!pte_present(pte) || !pte_read(pte)) + return 0; + if (write && !pte_write(pte)) return 0; /* hugepages are never "special" */ diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c deleted file mode 100644 index 1fa794d7d59f..000000000000 --- a/arch/powerpc/mm/icswx.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/uaccess.h> - -#include "icswx.h" - -/* - * The processor and its L2 cache cause the icswx instruction to - * generate a COP_REQ transaction on PowerBus. The transaction has no - * address, and the processor does not perform an MMU access to - * authenticate the transaction. The command portion of the PowerBus - * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor - * Process ID (PID), which the coprocessor compares to the authorized - * LPID and PID held in the coprocessor, to determine if the process - * is authorized to generate the transaction. The data of the COP_REQ - * transaction is 128-byte or less in size and is placed in cacheable - * memory on a 128-byte cache line boundary. - * - * The task to use a coprocessor should use use_cop() to mark the use - * of the Coprocessor Type (CT) and context switching. On a server - * class processor, the PID register is used only for coprocessor - * management + * and so a coprocessor PID is allocated before - * executing icswx + * instruction. Drop_cop() is used to free the - * coprocessor PID. - * - * Example: - * Host Fabric Interface (HFI) is a PowerPC network coprocessor. - * Each HFI have multiple windows. Each HFI window serves as a - * network device sending to and receiving from HFI network. - * HFI immediate send function uses icswx instruction. The immediate - * send function allows small (single cache-line) packets be sent - * without using the regular HFI send FIFO and doorbell, which are - * much slower than immediate send. - * - * For each task intending to use HFI immediate send, the HFI driver - * calls use_cop() to obtain a coprocessor PID for the task. - * The HFI driver then allocate a free HFI window and save the - * coprocessor PID to the HFI window to allow the task to use the - * HFI window. - * - * The HFI driver repeatedly creates immediate send packets and - * issues icswx instruction to send data through the HFI window. - * The HFI compares the coprocessor PID in the CPU PID register - * to the PID held in the HFI window to determine if the transaction - * is allowed. - * - * When the task to release the HFI window, the HFI driver calls - * drop_cop() to release the coprocessor PID. - */ - -void switch_cop(struct mm_struct *next) -{ -#ifdef CONFIG_PPC_ICSWX_PID - mtspr(SPRN_PID, next->context.cop_pid); -#endif - mtspr(SPRN_ACOP, next->context.acop); -} - -/** - * Start using a coprocessor. - * @acop: mask of coprocessor to be used. - * @mm: The mm the coprocessor to associate with. Most likely current mm. - * - * Return a positive PID if successful. Negative errno otherwise. - * The returned PID will be fed to the coprocessor to determine if an - * icswx transaction is authenticated. - */ -int use_cop(unsigned long acop, struct mm_struct *mm) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return -ENODEV; - - if (!mm || !acop) - return -EINVAL; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - ret = get_cop_pid(mm); - if (ret < 0) - goto out; - - /* update acop */ - mm->context.acop |= acop; - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. 
- */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - -out: - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(use_cop); - -/** - * Stop using a coprocessor. - * @acop: mask of coprocessor to be stopped. - * @mm: The mm the coprocessor associated with. - */ -void drop_cop(unsigned long acop, struct mm_struct *mm) -{ - int free_pid; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return; - - if (WARN_ON_ONCE(!mm)) - return; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - mm->context.acop &= ~acop; - - free_pid = disable_cop_pid(mm); - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - if (free_pid != COP_PID_NONE) - free_cop_pid(free_pid); - - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); -} -EXPORT_SYMBOL_GPL(drop_cop); - -static int acop_use_cop(int ct) -{ - /* There is no alternate policy, yet */ - return -1; -} - -/* - * Get the instruction word at the NIP - */ -static u32 acop_get_inst(struct pt_regs *regs) -{ - u32 inst; - u32 __user *p; - - p = (u32 __user *)regs->nip; - if (!access_ok(VERIFY_READ, p, sizeof(*p))) - return 0; - - if (__get_user(inst, p)) - return 0; - - return inst; -} - -/** - * @regs: registers at time of interrupt - * @address: storage address - * @error_code: Fault code, usually the DSISR or ESR depending on - * processor type - * - * Return 0 if we are able to resolve the data storage fault that - * results from a CT miss in the ACOP register. - */ -int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - int ct; - u32 inst = 0; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) { - pr_info("No coprocessors available"); - _exception(SIGILL, regs, ILL_ILLOPN, address); - } - - if (!user_mode(regs)) { - /* this could happen if the HV denies the - * kernel access, for now we just die */ - die("ICSWX from kernel failed", regs, SIGSEGV); - } - - /* Some implementations leave us a hint for the CT */ - ct = ICSWX_GET_CT_HINT(error_code); - if (ct < 0) { - /* we have to peek at the instruction word to figure out CT */ - u32 ccw; - u32 rs; - - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - - rs = (inst >> (31 - 10)) & 0x1f; - ccw = regs->gpr[rs]; - ct = (ccw >> 16) & 0x3f; - } - - /* - * We could be here because another thread has enabled acop - * but the ACOP register has yet to be updated. - * - * This should have been taken care of by the IPI to sync all - * the threads (see smp_call_function(sync_cop, mm, 1)), but - * that could take forever if there are a significant amount - * of threads. - * - * Given the number of threads on some of these systems, - * perhaps this is the best way to sync ACOP rather than whack - * every thread with an IPI. 
- */ - if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { - sync_cop(current->active_mm); - return 0; - } - - /* check for alternate policy */ - if (!acop_use_cop(ct)) - return 0; - - /* at this point the CT is unknown to the system */ - pr_warn("%s[%d]: Coprocessor %d is unavailable\n", - current->comm, current->pid, ct); - - /* get inst if we don't already have it */ - if (inst == 0) { - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - } - - /* Check if the instruction is the "record form" */ - if (inst & 1) { - /* - * the instruction is "record" form so we can reject - * using CR0 - */ - regs->ccr &= ~(0xful << 28); - regs->ccr |= ICSWX_RC_NOT_FOUND << 28; - - /* Move on to the next instruction */ - regs->nip += 4; - } else { - /* - * There is no architected mechanism to report a bad - * CT so we could either SIGILL or report nothing. - * Since the non-record version should only bu used - * for "hints" or "don't care" we should probably do - * nothing. However, I could see how some people - * might want an SIGILL so it here if you want it. - */ -#ifdef CONFIG_PPC_ICSWX_USE_SIGILL - _exception(SIGILL, regs, ILL_ILLOPN, address); -#else - regs->nip += 4; -#endif - } - - return 0; -} -EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h deleted file mode 100644 index 6dedc08e62c8..000000000000 --- a/arch/powerpc/mm/icswx.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _ARCH_POWERPC_MM_ICSWX_H_ -#define _ARCH_POWERPC_MM_ICSWX_H_ - -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <asm/mmu_context.h> - -/* also used to denote that PIDs are not used */ -#define COP_PID_NONE 0 - -static inline void sync_cop(void *arg) -{ - struct mm_struct *mm = arg; - - if (mm == current->active_mm) - switch_cop(current->active_mm); -} - -#ifdef CONFIG_PPC_ICSWX_PID -extern int get_cop_pid(struct mm_struct *mm); -extern int disable_cop_pid(struct mm_struct *mm); -extern void free_cop_pid(int free_pid); -#else -#define get_cop_pid(m) (COP_PID_NONE) -#define disable_cop_pid(m) (COP_PID_NONE) -#define free_cop_pid(p) -#endif - -/* - * These are implementation bits for architected registers. If this - * ever becomes architecture the should be moved to reg.h et. al. 
- */ -/* UCT is the same bit for Server and Embedded */ -#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ - -#ifdef CONFIG_PPC_BOOK3E -/* Embedded implementation gives us no hints as to what the CT is */ -#define ICSWX_GET_CT_HINT(x) (-1) -#else -/* Server implementation contains the CT value in the DSISR */ -#define ICSWX_DSISR_CTMASK 0x00003f00 -#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) -#endif - -#define ICSWX_RC_STARTED 0x8 /* The request has been started */ -#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ -#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ -#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ - -extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code); - -static inline u64 acop_copro_type_bit(unsigned int type) -{ - return 1ULL << (63 - type); -} - -#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c deleted file mode 100644 index 91e30eb7d054..000000000000 --- a/arch/powerpc/mm/icswx_pid.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * ICSWX and ACOP/PID Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> -#include <linux/module.h> -#include "icswx.h" - -#define COP_PID_MIN (COP_PID_NONE + 1) -#define COP_PID_MAX (0xFFFF) - -static DEFINE_SPINLOCK(mmu_context_acop_lock); -static DEFINE_IDA(cop_ida); - -static int new_cop_pid(struct ida *ida, int min_id, int max_id, - spinlock_t *lock) -{ - int index; - int err; - -again: - if (!ida_pre_get(ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(lock); - err = ida_get_new_above(ida, min_id, &index); - spin_unlock(lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(lock); - ida_remove(ida, index); - spin_unlock(lock); - return -ENOMEM; - } - - return index; -} - -int get_cop_pid(struct mm_struct *mm) -{ - int pid; - - if (mm->context.cop_pid == COP_PID_NONE) { - pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, - &mmu_context_acop_lock); - if (pid >= 0) - mm->context.cop_pid = pid; - } - return mm->context.cop_pid; -} - -int disable_cop_pid(struct mm_struct *mm) -{ - int free_pid = COP_PID_NONE; - - if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { - free_pid = mm->context.cop_pid; - mm->context.cop_pid = COP_PID_NONE; - } - return free_pid; -} - -void free_cop_pid(int free_pid) -{ - spin_lock(&mmu_context_acop_lock); - ida_remove(&cop_ida, free_pid); - spin_unlock(&mmu_context_acop_lock); -} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 8a7c38b8d335..6419b33ca309 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -113,6 +113,12 @@ void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } +#ifdef CONFIG_STRICT_KERNEL_RWX + if (rodata_enabled) { + __map_without_bats = 1; + __map_without_ltlbs = 1; + } +#endif } /* @@ -132,8 +138,6 @@ void __init MMU_init(void) * Reserve gigantic pages for hugetlb. 
This MUST occur before * lowmem_end_addr is initialized below. */ - reserve_hugetlb_gpages(); - if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock_enforce_memory_limit(memblock.memory.regions[0].size); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 5b4c25d12ff3..588a521966ec 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -356,7 +356,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#elif defined(CONFIG_FLATMEM) +#else struct page *realmode_pfn_to_page(unsigned long pfn) { @@ -365,7 +365,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_STD_MMU_64 static bool disable_radix; @@ -381,7 +381,7 @@ early_param("disable_radix", parse_disable_radix); * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do * radix. If not, we clear the radix feature bit so we fall back to hash. */ -static void early_check_vec5(void) +static void __init early_check_vec5(void) { unsigned long root, chosen; int size; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 46b4e67d2372..4362b86ef84c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -436,7 +436,7 @@ void flush_dcache_icache_page(struct page *page) return; } #endif -#if defined(CONFIG_8xx) || defined(CONFIG_PPC64) +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c new file mode 100644 index 000000000000..0f613bc63c50 --- /dev/null +++ b/arch/powerpc/mm/mmu_context.c @@ -0,0 +1,99 @@ +/* + * Common implementation of switch_mm_irqs_off + * + * Copyright IBM Corp. 2017 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/cpu.h> + +#include <asm/mmu_context.h> + +#if defined(CONFIG_PPC32) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 32-bit keeps track of the current PGDIR in the thread struct */ + tsk->thread.pgdir = mm->pgd; +} +#elif defined(CONFIG_PPC_BOOK3E_64) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 64-bit Book3E keeps track of current PGD in the PACA */ + get_paca()->pgd = mm->pgd; +} +#else +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) { } +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +#endif + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + bool new_on_cpu = false; + + /* Mark this context has been used on the new CPU */ + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + inc_mm_active_cpus(next); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. 
+ * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. + */ + smp_mb(); + + new_on_cpu = true; + } + + /* Some subarchs need to track the PGD elsewhere */ + switch_mm_pgdir(tsk, next); + + /* Nothing else to do if we aren't actually switching */ + if (prev == next) + return; + + /* + * We must stop all altivec streams before changing the HW + * context + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile ("dssall"); + + if (new_on_cpu) + radix_kvm_prefetch_workaround(next); + + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now + */ + switch_mmu_context(prev, next, tsk); +} + diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index a75f63833284..05e15386d4cb 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -25,8 +25,6 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> -#include "icswx.h" - static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); @@ -165,16 +163,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) return index; mm->context.id = index; -#ifdef CONFIG_PPC_ICSWX - mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!mm->context.cop_lockp) { - __destroy_context(index); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; - return -ENOMEM; - } - spin_lock_init(mm->context.cop_lockp); -#endif /* CONFIG_PPC_ICSWX */ #ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; @@ -182,6 +170,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(mm); #endif + atomic_set(&mm->context.active_cpus, 0); + return 0; } @@ -226,12 +216,6 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif -#ifdef CONFIG_PPC_ICSWX - drop_cop(mm->context.acop, mm); - kfree(mm->context.cop_lockp); - mm->context.cop_lockp = NULL; -#endif /* CONFIG_PPC_ICSWX */ - if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d46128b22150..57fbc554c785 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -27,7 +27,7 @@ /* * On 40x and 8xx, we directly inline tlbia and tlbivax */ -#if defined(CONFIG_40x) || defined(CONFIG_8xx) +#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx) static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); @@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid) } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) -#else /* CONFIG_40x || CONFIG_8xx */ +#else /* CONFIG_40x || CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); #ifdef CONFIG_PPC_BOOK3E @@ -46,12 +46,12 @@ extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif -#endif /* !(CONFIG_40x || CONFIG_8xx) */ +#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx static inline void 
_tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { @@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { __tlbil_va(address, pid); } -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 31eed8fa8e99..3b65917785a5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <misc/cxl-base.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -64,6 +65,27 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. + */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); +} + /* * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. @@ -77,7 +99,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 443a2c66a304..ec277913e01b 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * by sending an IPI to all the cpus and executing a dummy * function there. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(vma->vm_mm); /* * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a @@ -329,7 +329,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, unsigned int psize; unsigned long vsid; unsigned long flags = 0; - const struct cpumask *tmp; /* get the base page size,vsid and segment size */ #ifdef CONFIG_DEBUG_VM @@ -350,8 +349,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, ssize = mmu_kernel_ssize; } - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) + if (mm_is_thread_local(mm)) flags |= HPTE_LOCAL_UPDATE; return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); @@ -380,16 +378,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, */ memset(pgtable, 0, PTE_FRAG_SIZE); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte variants which do lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. 
Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - find_linux_pte_or_hugepage to finish. + find_current_mm_pte to finish. */ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 671a45d86c18..39c252b54d16 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -8,10 +8,15 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#define pr_fmt(fmt) "radix-mmu: " fmt + +#include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/mm.h> +#include <linux/string_helpers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -31,9 +36,13 @@ unsigned int mmu_base_pid; static int native_register_process_table(unsigned long base, unsigned long pg_sz, unsigned long table_size) { - unsigned long patb1 = base | table_size | PATB_GR; + unsigned long patb0, patb1; + + patb0 = be64_to_cpu(partition_tb[0].patb0); + patb1 = base | table_size | PATB_GR; + + mmu_partition_table_set_entry(0, patb0, patb1); - partition_tb->patb1 = cpu_to_be64(patb1); return 0; } @@ -179,10 +188,14 @@ static inline void __meminit print_mapping(unsigned long start, unsigned long end, unsigned long size) { + char buf[10]; + if (end <= start) return; - pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size); + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); } static int __meminit create_physical_mapping(unsigned long start, @@ -526,6 +539,7 @@ void __init radix__early_init_mmu(void) __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -836,9 +850,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre */ pmd = *pmdp; pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ - kick_all_cpus_sync(); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + serialize_against_pte_lookup(vma->vm_mm); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + return pmd; } @@ -897,16 +914,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); /* - * Serialize against find_linux_pte_or_hugepte which does lock-less + * Serialize against find_current_mm_pte which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - find_linux_pte_or_hugepage to finish. + find_current_mm_pte to finish. 
*/ - kick_all_cpus_sync(); + serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a9e4bfc025bc..65eda1997c3f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -34,6 +34,7 @@ #include <asm/fixmap.h> #include <asm/io.h> #include <asm/setup.h> +#include <asm/sections.h> #include "mmu_decl.h" @@ -242,7 +243,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) /* * Map in a chunk of physical memory starting at start. */ -void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) +static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { unsigned long v, s, f; phys_addr_t p; @@ -294,7 +295,7 @@ void __init mapin_ram(void) * Returns true (1) if PTE was found, zero otherwise. The pointer to * the PTE pointer is unmodified if PTE is not found. */ -int +static int get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) { pgd_t *pgd; @@ -323,9 +324,7 @@ get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) return(retval); } -#ifdef CONFIG_DEBUG_PAGEALLOC - -static int __change_page_attr(struct page *page, pgprot_t prot) +static int __change_page_attr_noflush(struct page *page, pgprot_t prot) { pte_t *kpte; pmd_t *kpmd; @@ -339,8 +338,6 @@ static int __change_page_attr(struct page *page, pgprot_t prot) if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) return -EINVAL; __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); - wmb(); - flush_tlb_page(NULL, address); pte_unmap(kpte); return 0; @@ -349,44 +346,65 @@ static int __change_page_attr(struct page *page, pgprot_t prot) /* * Change the page attributes of an page in the linear mapping. * - * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY + * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY */ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int i, err = 0; unsigned long flags; + struct page *start = page; local_irq_save(flags); for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); + err = __change_page_attr_noflush(page, prot); if (err) break; } + wmb(); + flush_tlb_kernel_range((unsigned long)page_address(start), + (unsigned long)page_address(page)); local_irq_restore(flags); return err; } - -void __kernel_map_pages(struct page *page, int numpages, int enable) +void mark_initmem_nx(void) { - if (PageHighMem(page)) - return; + struct page *page = virt_to_page(_sinittext); + unsigned long numpages = PFN_UP((unsigned long)_einittext) - + PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + change_page_attr(page, numpages, PAGE_KERNEL); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ -static int fixmaps; - -void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_rodata_ro(void) { - unsigned long address = __fix_to_virt(idx); + struct page *page; + unsigned long numpages; + + page = virt_to_page(_stext); + numpages = PFN_UP((unsigned long)_etext) - + PFN_DOWN((unsigned long)_stext); - if (idx >= __end_of_fixed_addresses) { - BUG(); + change_page_attr(page, numpages, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. 
+ */ + page = virt_to_page(__start_rodata); + numpages = PFN_UP((unsigned long)__init_begin) - + PFN_DOWN((unsigned long)__start_rodata); + + change_page_attr(page, numpages, PAGE_KERNEL_RO); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) return; - } - map_kernel_page(address, phys, pgprot_val(flags)); - fixmaps++; + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0736e94c7615..ac0717a90ca6 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -104,6 +104,8 @@ unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index bde378559d01..906a86fe457b 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -121,12 +121,25 @@ slb_miss_kernel_load_vmemmap: 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - /* vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines + /* + * r10 contains the ESID, which is the original faulting EA shifted + * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) + * which is 0xd00038000. That can't be used as an immediate, even if we + * ignored the 0xd, so we have to load it into a register, and we only + * have one register free. So we must load all of (H_VMALLOC_END >> 28) + * into a register and compare ESID against that. + */ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 + // Rotate left 4, then mask with 0xffffffff0 + rldic r11,r11,4,28 // r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + + /* + * vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines. 
*/ - clrldi r11,r10,48 - cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 - bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 16ae1bbe13f0..b3e849c4886e 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -54,23 +54,15 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) */ __tlbiel_pid(pid, 0, ric); - if (ric == RIC_FLUSH_ALL) - /* For the remaining sets, just flush the TLB */ - ric = RIC_FLUSH_TLB; + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + /* For the remaining sets, just flush the TLB */ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, ric); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void tlbiel_pwc(unsigned long pid) -{ - asm volatile("ptesync": : :"memory"); - - /* For PWC flush, we don't look at set number */ - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); @@ -146,31 +138,23 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); -void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +#ifndef CONFIG_SMP +static void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. - */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - tlbiel_pwc(pid); - + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(radix__local_flush_tlb_pwc); +#endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) @@ -208,38 +192,35 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); + _tlbie_pid(pid, RIC_FLUSH_TLB); else - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +static void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. 
- */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_PWC); + _tlbie_pid(pid, RIC_FLUSH_ALL); else - tlbiel_pwc(pid); + _tlbiel_pid(pid, RIC_FLUSH_ALL); no_context: preempt_enable(); } + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} EXPORT_SYMBOL(radix__flush_tlb_pwc); void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -271,6 +252,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(radix__flush_tlb_page); +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -288,6 +271,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -319,7 +303,10 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else + else if (tlb->need_flush_all) { + tlb->need_flush_all = 0; + radix__flush_all_mm(mm); + } else radix__flush_tlb_mm(mm); } @@ -364,6 +351,43 @@ err_out: preempt_enable(); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + int local = mm_is_thread_local(mm); + unsigned long ap = mmu_get_ap(mmu_virtual_psize); + unsigned long pid, end; + + + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + /* Otherwise first do the PWC */ + if (local) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + else + _tlbie_pid(pid, RIC_FLUSH_PWC); + + /* Then iterate the pages */ + end = addr + HPAGE_PMD_SIZE; + for (; addr < end; addr += PAGE_SIZE) { + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + } +no_context: + preempt_enable(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size) { diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index b5b0fb97b9c0..881ebd53ffc2 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -29,6 +29,8 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/pte-walk.h> + #include <trace/events/thp.h> @@ -138,13 +140,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) { - const struct cpumask *tmp; - int i, local = 0; + int i, local; i = batch->index; - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(batch->mm), tmp)) - local = 1; + local = mm_is_thread_local(batch->mm); if (i == 1) flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); @@ -207,8 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, + &hugepage_shift); unsigned long 
pte; if (ptep == NULL) diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index eabecfcaef7c..048b8e9f4492 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -60,7 +60,7 @@ _GLOBAL(__tlbil_va) isync 1: blr -#elif defined(CONFIG_8xx) +#elif defined(CONFIG_PPC_8xx) /* * Nothing to do for 8xx, everything is inline |
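
A recurring change in the hunks above is the replacement of kick_all_cpus_sync() with the serialize_against_pte_lookup() helper added in pgtable-book3s64.c, which issues a no-op IPI only to the CPUs in mm_cpumask(mm) instead of every online CPU. The sketch below is a minimal illustration of the calling pattern used by the pmdp collapse/clear paths, assuming the helper and kernel types shown in the diff; clear_pmd_and_serialize() itself is a hypothetical wrapper for illustration, not a function from this series.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * Sketch of the pattern followed by radix__pmdp_collapse_flush() and the
 * *_pmdp_huge_get_and_clear() paths above: clear the pmd, then wait for
 * any concurrent lock-less walker before the entry can be repopulated
 * with a page-table pointer. Hypothetical wrapper, illustration only.
 */
static pmd_t clear_pmd_and_serialize(struct mm_struct *mm, pmd_t *pmdp)
{
	pmd_t old_pmd = *pmdp;

	pmd_clear(pmdp);

	/*
	 * Per the pgtable-book3s64.c hunk: smp_mb() plus an empty
	 * smp_call_function_many() over mm_cpumask(mm), so this returns
	 * only after every CPU running this mm has left any IRQ-disabled
	 * find_current_mm_pte() walk.
	 */
	serialize_against_pte_lookup(mm);

	return old_pmd;
}

The helper only interrupts the CPUs recorded in mm_cpumask(); switch_mm_irqs_off() in the new mmu_context.c marks that mask with a full barrier that pairs with the barrier in pte_xchg() on the lookup side, which is what keeps this narrower IPI sufficient.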