diff options
Diffstat (limited to 'arch')
78 files changed, 958 insertions, 1828 deletions
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 1075534b0a2e..8ed8b9a24efe 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -283,7 +283,7 @@ void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr, * flush_dcache_page is used when the kernel has written to the page * cache page at virtual address page->virtual. * - * If this page isn't mapped (ie, page_mapping == NULL), or it might + * If this page isn't mapped (ie, folio_mapping == NULL), or it might * have userspace mappings, then we _must_ always clean + invalidate * the dcache entries associated with the kernel mapping. * diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index a30be5505793..87d48e2d90ad 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -13,12 +13,12 @@ /* * If our huge pte is non-zero then mark the valid bit. - * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero + * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for non-zero * ptes. * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). */ #define __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t retval = *ptep; if (pte_val(retval)) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index fefac75fa009..28ab96e808ef 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -117,7 +117,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, * flush_dcache_folio is used when the kernel has written to the page * cache page at virtual address page->virtual. * - * If this page isn't mapped (ie, page_mapping == NULL), or it might + * If this page isn't mapped (ie, folio_mapping == NULL), or it might * have userspace mappings, then we _must_ always clean + invalidate * the dcache entries associated with the kernel mapping. * diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 3954cbd2ff56..293f880865e8 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET -extern pte_t huge_ptep_get(pte_t *ptep); +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void __init arm64_hugetlb_cma_reserve(void); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 3f09ac73cce3..5f1e2103888b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -127,7 +127,7 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) return contig_ptes; } -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { int ncontig, i; size_t pgsize; diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index af3acdf3481a..161dd6e10479 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -467,8 +467,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, addr, ptep, nr) \ + update_mmu_cache_range(NULL, vma, addr, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index e27a4c83c548..c29a551eb0ca 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -594,8 +594,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, address, ptep) \ update_mmu_cache_range(NULL, vma, address, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, address, ptep, nr) \ + update_mmu_cache_range(NULL, vma, address, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index df1ced4fc3b5..bf9a37c60e9f 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -112,7 +112,7 @@ void __flush_dcache_pages(struct page *page, unsigned int nr) } /* - * We could delay the flush for the !page_mapping case too. But that + * We could delay the flush for the !folio_mapping case too. But that * case is for exec env/arg pages and those are %99 certainly going to * get faulted into the tlb (and thus flushed) anyways. */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f8891fbe7c16..bc5a1612be72 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,7 +135,6 @@ config PPC select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index dc5c039eb28e..dd4eb3063175 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned index_size) } } -#define get_hugepd_cache_index(x) (x) - static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 6472b08fa1b0..c654c376ef8b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -74,21 +74,6 @@ #define remap_4k_pfn(vma, addr, pfn, prot) \ remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot)) -#ifdef CONFIG_HUGETLB_PAGE -static inline int hash__hugepd_ok(hugepd_t hpd) -{ - unsigned long hpdval = hpd_val(hpd); - /* - * if it is not a pte and have hugepd shift mask - * set, then it is a hugepd directory pointer - */ - if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) && - ((hpdval & HUGEPD_SHIFT_MASK) != 0)) - return true; - return false; -} -#endif - /* * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just * a matter of returning the PTE bits that need to be modified. On 64K PTE, diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index faf3e3b4e4b2..0755f2567021 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include <asm/asm-const.h> +#include <asm/book3s/64/slice.h> /* * Common bits between 4K and 64K pages in a linux-style PTE. @@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags); /* Atomic PTE updates */ -static inline unsigned long hash__pte_update(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, unsigned long clr, - unsigned long set, - int huge) +static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr, + unsigned long set) { __be64 old_be, tmp_be; - unsigned long old; __asm__ __volatile__( "1: ldarx %0,0,%3 # pte_update\n\ @@ -182,11 +179,40 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm, : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); + + return be64_to_cpu(old_be); +} + +static inline unsigned long hash__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) +{ + unsigned long old; + + old = hash__pte_update_one(ptep, clr, set); + + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) { + unsigned int psize = get_slice_psize(mm, addr); + int nb, i; + + if (psize == MMU_PAGE_16M) + nb = SZ_16M / PMD_SIZE; + else if (psize == MMU_PAGE_16G) + nb = SZ_16G / PUD_SIZE; + else + nb = 1; + + WARN_ON_ONCE(nb == 1); /* Should never happen */ + + for (i = 1; i < nb; i++) + hash__pte_update_one(ptep + i, clr, set); + } /* huge pages use the old page table lock */ if (!huge) assert_pte_locked(mm, addr); - old = be64_to_cpu(old_be); if (old & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index aa1c67c8bfc8..f0bba9c5f9c3 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -49,9 +49,6 @@ static inline bool gigantic_page_runtime_supported(void) return true; } -/* hugepd entry valid bit */ -#define HUGEPD_VAL_BITS (0x8000000000000000UL) - #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); @@ -60,29 +57,7 @@ extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); -/* - * This should work for other subarchs too. But right now we use the - * new format only for 64bit book3s - */ -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - /* - * We have only four bits to encode, MMU page size - */ - BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf); - return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK); -} - -static inline unsigned int hugepd_mmu_psize(hugepd_t hpd) -{ - return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2; -} -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); -} static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { @@ -90,19 +65,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, return radix__flush_hugetlb_page(vma, vmaddr); } -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned int pdshift) -{ - unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd); - - return hugepd_page(hpd) + idx; -} - -static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) -{ - *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); -} - void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); static inline int check_and_get_huge_psize(int shift) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h deleted file mode 100644 index baf934578c3a..000000000000 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H -#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H -/* - * hash 4k can't share hugetlb and also doesn't support THP - */ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -/* - * With radix , we have hugepage ptes in the pud and pmd entries. We don't - * need to setup hugepage directory for them. Our pte and page directory format - * enable us to have this enabled. - */ -static inline int hugepd_ok(hugepd_t hpd) -{ - if (radix_enabled()) - return 0; - return hash__hugepd_ok(hpd); -} -#define is_hugepd(hpd) (hugepd_ok(hpd)) - -/* - * 16M and 16G huge page directory tables are allocated from slab cache - * - */ -#define H_16M_CACHE_INDEX (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE - 24) -#define H_16G_CACHE_INDEX \ - (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + H_PUD_INDEX_SIZE - 34) - -static inline int get_hugepd_cache_index(int index) -{ - switch (index) { - case H_16M_CACHE_INDEX: - return HTLB_16M_INDEX; - case H_16G_CACHE_INDEX: - return HTLB_16G_INDEX; - default: - BUG(); - } - /* should not reach */ -} - -#endif /* CONFIG_HUGETLB_PAGE */ - -#endif /* __ASSEMBLY__ */ - -#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 6ac73da7b80e..4d8d7b4ea16b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -5,26 +5,6 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_HUGETLB_PAGE -/* - * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't - * need to setup hugepage directory for them. Our pte and page directory format - * enable us to have this enabled. - */ -static inline int hugepd_ok(hugepd_t hpd) -{ - return 0; -} - -#define is_hugepd(pdep) 0 - -/* - * This should never get called - */ -static __always_inline int get_hugepd_cache_index(int index) -{ - BUILD_BUG(); -} - #endif /* CONFIG_HUGETLB_PAGE */ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8f9432e3855a..519b1743a0f4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -274,6 +274,24 @@ static inline bool pud_leaf(pud_t pud) { return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } + +#define pmd_leaf_size pmd_leaf_size +static inline unsigned long pmd_leaf_size(pmd_t pmd) +{ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) + return SZ_16M; + else + return PMD_SIZE; +} + +#define pud_leaf_size pud_leaf_size +static inline unsigned long pud_leaf_size(pud_t pud) +{ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) + return SZ_16G; + else + return PUD_SIZE; +} #endif /* __ASSEMBLY__ */ #include <asm/book3s/64/hash.h> @@ -285,11 +303,9 @@ static inline bool pud_leaf(pud_t pud) #define MAX_PHYSMEM_BITS R_MAX_PHYSMEM_BITS #endif - +/* hash 4k can't share hugetlb and also doesn't support THP */ #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/pgtable-64k.h> -#else -#include <asm/book3s/64/pgtable-4k.h> #endif #include <asm/barrier.h> diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index ea71f7245a63..18a3028ac3b6 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -30,10 +30,9 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, } #define is_hugepage_only_range is_hugepage_only_range -#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE -void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, - unsigned long end, unsigned long floor, - unsigned long ceiling); +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -67,14 +66,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, { } -#define hugepd_shift(x) 0 -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned pdshift) -{ - return NULL; -} - - static inline void __init gigantic_hugetlb_cma_reserve(void) { } diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 92df40c6cc6b..014799557f60 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -4,42 +4,12 @@ #define PAGE_SHIFT_8M 23 -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - - return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return PAGE_SHIFT_8M; -} - -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned int pdshift) -{ - unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT; - - return hugepd_page(hpd) + idx; -} - static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { flush_tlb_page(vma, vmaddr); } -static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) -{ - *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M); -} - -static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned int pshift) -{ - *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M); -} - static inline int check_and_get_huge_psize(int shift) { return shift_to_mmu_psize(shift); @@ -49,6 +19,14 @@ static inline int check_and_get_huge_psize(int shift) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); +#define __HAVE_ARCH_HUGE_PTEP_GET +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + if (ptep_is_8m_pmdp(mm, addr, ptep)) + ptep = pte_offset_kernel((pmd_t *)ptep, ALIGN_DOWN(addr, SZ_8M)); + return ptep_get(ptep); +} + #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 141d82e249a8..a756a1e59c54 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -189,19 +189,14 @@ typedef struct { #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) -/* Page size definitions, common between 32 and 64-bit +/* + * Page size definitions for 8xx * * shift : is the "PAGE_SHIFT" value for that page size - * penc : is the pte encoding mask * */ struct mmu_psize_def { unsigned int shift; /* number of bits */ - unsigned int enc; /* PTE encoding */ - unsigned int ind; /* Corresponding indirect page size shift */ - unsigned int flags; -#define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */ -#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */ }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index 851813725237..da0469928273 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -75,9 +75,6 @@ #define _PAGE_NO_CACHE 0x00000400 /* H: I bit */ #define _PAGE_WRITETHRU 0x00000800 /* H: W bit */ -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 - /* TODO: Add large page lowmem mapping support */ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 653a342d3b25..14d64b4f3f14 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -31,9 +31,6 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 - #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 137dc3c84e45..54ebb91dbdcf 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -74,12 +74,11 @@ #define _PTE_NONE_MASK 0 #ifdef CONFIG_PPC_16K_PAGES -#define _PAGE_PSIZE _PAGE_SPS +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SPS) #else -#define _PAGE_PSIZE 0 +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #endif -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) #define _PAGE_BASE (_PAGE_BASE_NC) #include <asm/pgtable-masks.h> @@ -120,7 +119,7 @@ static inline pte_t pte_mkhuge(pte_t pte) #define pte_mkhuge pte_mkhuge -static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, +static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, unsigned long set, int huge); static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -142,19 +141,12 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *pt } #define __ptep_set_access_flags __ptep_set_access_flags -static inline unsigned long pgd_leaf_size(pgd_t pgd) -{ - if (pgd_val(pgd) & _PMD_PAGE_8M) - return SZ_8M; - return SZ_4M; -} - -#define pgd_leaf_size pgd_leaf_size - -static inline unsigned long pte_leaf_size(pte_t pte) +static inline unsigned long __pte_leaf_size(pmd_t pmd, pte_t pte) { pte_basic_t val = pte_val(pte); + if (pmd_val(pmd) & _PMD_PAGE_8M) + return SZ_8M; if (val & _PAGE_HUGE) return SZ_512K; if (val & _PAGE_SPS) @@ -162,31 +154,38 @@ static inline unsigned long pte_leaf_size(pte_t pte) return SZ_4K; } -#define pte_leaf_size pte_leaf_size +#define __pte_leaf_size __pte_leaf_size /* * On the 8xx, the page tables are a bit special. For 16k pages, we have * 4 identical entries. For 512k pages, we have 128 entries as if it was * 4k pages, but they are flagged as 512k pages for the hardware. - * For other page sizes, we have a single entry in the table. + * For 8M pages, we have 1024 entries as if it was 4M pages (PMD_SIZE) + * but they are flagged as 8M pages for the hardware. + * For 4k pages, we have a single entry in the table. */ static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr); -static int hugepd_ok(hugepd_t hpd); +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address); + +static inline bool ptep_is_8m_pmdp(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + return (pmd_t *)ptep == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)); +} static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) { if (!huge) return PAGE_SIZE / SZ_4K; - else if (hugepd_ok(*((hugepd_t *)pmd))) - return 1; + else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M) + return SZ_4M / SZ_4K; else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE)) return SZ_16K / SZ_4K; else return SZ_512K / SZ_4K; } -static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, - unsigned long clr, unsigned long set, int huge) +static inline pte_basic_t __pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, + unsigned long clr, unsigned long set, int huge) { pte_basic_t *entry = (pte_basic_t *)p; pte_basic_t old = pte_val(*p); @@ -198,7 +197,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p for (i = 0; i < num; i += PAGE_SIZE / SZ_4K, new += PAGE_SIZE) { *entry++ = new; - if (IS_ENABLED(CONFIG_PPC_16K_PAGES) && num != 1) { + if (IS_ENABLED(CONFIG_PPC_16K_PAGES)) { *entry++ = new; *entry++ = new; *entry++ = new; @@ -208,6 +207,21 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p return old; } +static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + unsigned long clr, unsigned long set, int huge) +{ + pte_basic_t old; + + if (huge && ptep_is_8m_pmdp(mm, addr, ptep)) { + pmd_t *pmdp = (pmd_t *)ptep; + + old = __pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, huge); + __pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, huge); + } else { + old = __pte_update(mm, addr, ptep, clr, set, huge); + } + return old; +} #define pte_update pte_update #ifdef CONFIG_PPC_16K_PAGES diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h index 8f04ad20e040..cab0e1f1eea0 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h @@ -2,38 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H #define _ASM_POWERPC_NOHASH_HUGETLB_E500_H -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - if (WARN_ON(!hugepd_ok(hpd))) - return NULL; - - return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE); -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return hpd_val(hpd) & HUGEPD_SHIFT_MASK; -} - -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned int pdshift) -{ - /* - * On FSL BookE, we have multiple higher-level table entries that - * point to the same hugepte. Just use the first one since they're all - * identical. So for that case, idx=0. - */ - return hugepd_page(hpd); -} - void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) -{ - /* We use the old format for PPC_E500 */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -} - static inline int check_and_get_huge_psize(int shift) { if (shift & 1) /* Not a power of 4 */ @@ -42,4 +12,13 @@ static inline int check_and_get_huge_psize(int shift) return shift_to_mmu_psize(shift); } +static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) +{ + unsigned int tsize = shift - _PAGE_PSIZE_SHIFT_OFFSET; + pte_basic_t val = (tsize << _PAGE_PSIZE_SHIFT) & _PAGE_PSIZE_MSK; + + return __pte((pte_val(entry) & ~(pte_basic_t)_PAGE_PSIZE_MSK) | val); +} +#define arch_make_huge_pte arch_make_huge_pte + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */ diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h index 6ddced0415cb..b281d9eeaf1e 100644 --- a/arch/powerpc/include/asm/nohash/mmu-e500.h +++ b/arch/powerpc/include/asm/nohash/mmu-e500.h @@ -244,14 +244,11 @@ typedef struct { /* Page size definitions, common between 32 and 64-bit * * shift : is the "PAGE_SHIFT" value for that page size - * penc : is the pte encoding mask * */ struct mmu_psize_def { unsigned int shift; /* number of bits */ - unsigned int enc; /* PTE encoding */ - unsigned int ind; /* Corresponding indirect page size shift */ unsigned int flags; #define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */ #define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */ @@ -303,8 +300,7 @@ extern unsigned long linear_map_top; extern int book3e_htw_mode; #define PPC_HTW_NONE 0 -#define PPC_HTW_IBM 1 -#define PPC_HTW_E6500 2 +#define PPC_HTW_E6500 1 /* * 64-bit booke platforms don't load the tlb in the tlb miss handler code. diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index 4b62376318e1..d06efac6d7aa 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift) } } -#define get_hugepd_cache_index(x) (x) - static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { unsigned long pgf = (unsigned long)table; diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index f5f39d4f03c8..8d1f0b7062eb 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -31,6 +31,13 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p extern int icache_44x_need_flush; +#ifndef pte_huge_size +static inline unsigned long pte_huge_size(pte_t pte) +{ + return PAGE_SIZE; +} +#endif + /* * PTE updates. This function is called whenever an existing * valid PTE is updated. This does -not- include set_pte_at() @@ -52,11 +59,34 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p { pte_basic_t old = pte_val(*p); pte_basic_t new = (old & ~(pte_basic_t)clr) | set; + unsigned long sz; + unsigned long pdsize; + int i; if (new == old) return old; - *p = __pte(new); + if (huge) + sz = pte_huge_size(__pte(old)); + else + sz = PAGE_SIZE; + + if (sz < PMD_SIZE) + pdsize = PAGE_SIZE; + else if (sz < PUD_SIZE) + pdsize = PMD_SIZE; + else if (sz < P4D_SIZE) + pdsize = PUD_SIZE; + else if (sz < PGDIR_SIZE) + pdsize = P4D_SIZE; + else + pdsize = PGDIR_SIZE; + + for (i = 0; i < sz / pdsize; i++, p++) { + *p = __pte(new); + if (new) + new += (unsigned long long)(pdsize / PAGE_SIZE) << PTE_RPN_SHIFT; + } if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; @@ -340,20 +370,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, #define pgprot_writecombine pgprot_noncached_wc -#ifdef CONFIG_HUGETLB_PAGE -static inline int hugepd_ok(hugepd_t hpd) -{ -#ifdef CONFIG_PPC_8xx - return ((hpd_val(hpd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M); -#else - /* We clear the top bit to indicate hugepd */ - return (hpd_val(hpd) && (hpd_val(hpd) & PD_HUGE) == 0); -#endif -} - -#define is_hugepd(hpd) (hugepd_ok(hpd)) -#endif - int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index f516f0b5b7a8..cb78392494da 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -19,20 +19,7 @@ #define _PAGE_BAP_SX 0x000040 #define _PAGE_BAP_UX 0x000080 #define _PAGE_PSIZE_MSK 0x000f00 -#define _PAGE_PSIZE_4K 0x000200 -#define _PAGE_PSIZE_8K 0x000300 -#define _PAGE_PSIZE_16K 0x000400 -#define _PAGE_PSIZE_32K 0x000500 -#define _PAGE_PSIZE_64K 0x000600 -#define _PAGE_PSIZE_128K 0x000700 -#define _PAGE_PSIZE_256K 0x000800 -#define _PAGE_PSIZE_512K 0x000900 -#define _PAGE_PSIZE_1M 0x000a00 -#define _PAGE_PSIZE_2M 0x000b00 -#define _PAGE_PSIZE_4M 0x000c00 -#define _PAGE_PSIZE_8M 0x000d00 -#define _PAGE_PSIZE_16M 0x000e00 -#define _PAGE_PSIZE_32M 0x000f00 +#define _PAGE_TSIZE_4K 0x000100 #define _PAGE_DIRTY 0x001000 /* C: page changed */ #define _PAGE_SW0 0x002000 #define _PAGE_U3 0x004000 @@ -46,6 +33,9 @@ #define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ #define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ +#define _PAGE_PSIZE_SHIFT 7 +#define _PAGE_PSIZE_SHIFT_OFFSET 10 + /* "Higher level" linux bit combinations */ #define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */ #define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read permission */ @@ -65,8 +55,6 @@ #define _PAGE_SPECIAL _PAGE_SW0 -/* Base page size */ -#define _PAGE_PSIZE _PAGE_PSIZE_4K #define PTE_RPN_SHIFT (24) #define PTE_WIMGE_SHIFT (19) @@ -89,7 +77,7 @@ * pages. We always set _PAGE_COHERENT when SMP is enabled or * the processor might need it for DMA coherency. */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_TSIZE_4K) #if defined(CONFIG_SMP) #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) #else @@ -105,6 +93,47 @@ static inline pte_t pte_mkexec(pte_t pte) } #define pte_mkexec pte_mkexec +static inline unsigned long pte_huge_size(pte_t pte) +{ + pte_basic_t val = pte_val(pte); + + return 1UL << (((val & _PAGE_PSIZE_MSK) >> _PAGE_PSIZE_SHIFT) + _PAGE_PSIZE_SHIFT_OFFSET); +} +#define pte_huge_size pte_huge_size + +static inline int pmd_leaf(pmd_t pmd) +{ + if (IS_ENABLED(CONFIG_PPC64)) + return (long)pmd_val(pmd) > 0; + else + return pmd_val(pmd) & _PAGE_PSIZE_MSK; +} +#define pmd_leaf pmd_leaf + +static inline unsigned long pmd_leaf_size(pmd_t pmd) +{ + return pte_huge_size(__pte(pmd_val(pmd))); +} +#define pmd_leaf_size pmd_leaf_size + +#ifdef CONFIG_PPC64 +static inline int pud_leaf(pud_t pud) +{ + if (IS_ENABLED(CONFIG_PPC64)) + return (long)pud_val(pud) > 0; + else + return pud_val(pud) & _PAGE_PSIZE_MSK; +} +#define pud_leaf pud_leaf + +static inline unsigned long pud_leaf_size(pud_t pud) +{ + return pte_huge_size(__pte(pud_val(pud))); +} +#define pud_leaf_size pud_leaf_size + +#endif + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index e411e5a70ea3..83d0a4fc5f75 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -269,38 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) #define is_kernel_addr(x) ((x) >= TASK_SIZE) #endif -#ifndef CONFIG_PPC_BOOK3S_64 -/* - * Use the top bit of the higher-level page table entries to indicate whether - * the entries we point to contain hugepages. This works because we know that - * the page tables live in kernel space. If we ever decide to support having - * page tables at arbitrary addresses, this breaks and will have to change. - */ -#ifdef CONFIG_PPC64 -#define PD_HUGE 0x8000000000000000UL -#else -#define PD_HUGE 0x80000000 -#endif - -#else /* CONFIG_PPC_BOOK3S_64 */ -/* - * Book3S 64 stores real addresses in the hugepd entries to - * avoid overlaps with _PAGE_PRESENT and _PAGE_PTE. - */ -#define HUGEPD_ADDR_MASK (0x0ffffffffffffffful & ~HUGEPD_SHIFT_MASK) -#endif /* CONFIG_PPC_BOOK3S_64 */ - -/* - * Some number of bits at the level of the page table that points to - * a hugepte are used to encode the size. This masks those bits. - * On 8xx, HW assistance requires 4k alignment for the hugepte. - */ -#ifdef CONFIG_PPC_8xx -#define HUGEPD_SHIFT_MASK 0xfff -#else -#define HUGEPD_SHIFT_MASK 0x3f -#endif - #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 82633200b500..6bd8f89b25dc 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) return pmd_raw(old) == prev; } -#ifdef CONFIG_ARCH_HAS_HUGEPD -typedef struct { __be64 pdbe; } hugepd_t; -#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) }) - -static inline unsigned long hpd_val(hugepd_t x) -{ - return be64_to_cpu(x.pdbe); -} -#endif - #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 082c85cc09b1..7b3d4c592a10 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x) #endif /* CONFIG_PPC64 */ /* PGD level */ +#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT) +typedef struct { unsigned long long pgd; } pgd_t; +#else typedef struct { unsigned long pgd; } pgd_t; +#endif #define __pgd(x) ((pgd_t) { (x) }) static inline unsigned long pgd_val(pgd_t x) { @@ -83,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) } #endif -#ifdef CONFIG_ARCH_HAS_HUGEPD -typedef struct { unsigned long pd; } hugepd_t; -#define __hugepd(x) ((hugepd_t) { (x) }) -static inline unsigned long hpd_val(hugepd_t x) -{ - return x.pd; -} -#endif - #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 239709a2f68e..264a6c09517a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -106,6 +106,9 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned int shift); +#ifdef CONFIG_PPC32 +void __init *early_alloc_pgtable(unsigned long size); +#endif pte_t *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index dcf0591ad3c2..63f6b9f513a4 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -485,8 +485,8 @@ interrupt_base_book3e: /* fake trap */ EXCEPTION_STUB(0x160, decrementer) /* 0x0900 */ EXCEPTION_STUB(0x180, fixed_interval) /* 0x0980 */ EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */ - EXCEPTION_STUB(0x1c0, data_tlb_miss) - EXCEPTION_STUB(0x1e0, instruction_tlb_miss) + EXCEPTION_STUB(0x1c0, data_tlb_miss_bolted) + EXCEPTION_STUB(0x1e0, instruction_tlb_miss_bolted) EXCEPTION_STUB(0x200, altivec_unavailable) EXCEPTION_STUB(0x220, altivec_assist) EXCEPTION_STUB(0x260, perfmon) diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 39724ff5ae1f..f9a73fae6464 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -294,9 +294,10 @@ set_ivor: /* Macros to hide the PTE size differences * * FIND_PTE -- walks the page tables given EA & pgdir pointer - * r10 -- EA of fault + * r10 -- free * r11 -- PGDIR pointer * r12 -- free + * r13 -- EA of fault * label 2: is the bailout case * * if we find the pte (fall through): @@ -307,34 +308,34 @@ set_ivor: #ifdef CONFIG_PTE_64BIT #ifdef CONFIG_HUGETLB_PAGE #define FIND_PTE \ - rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ - lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm r12, r13, 14, 18, 28; /* Compute pgdir/pmd offset */ \ + add r12, r11, r12; \ + lwz r11, 4(r12); /* Get pgd/pmd entry */ \ + rlwinm. r10, r11, 32 - _PAGE_PSIZE_SHIFT, 0x1e; /* get tsize*/ \ + bne 1000f; /* Huge page (leaf entry) */ \ rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ - blt 1000f; /* Normal non-huge page */ \ beq 2f; /* Bail if no table */ \ - oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ - andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ - xor r12, r10, r11; /* drop size bits from pointer */ \ - b 1001f; \ -1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + rlwimi r12, r13, 23, 20, 28; /* Compute pte address */ \ li r10, 0; /* clear r10 */ \ -1001: lwz r11, 4(r12); /* Get pte entry */ + lwz r11, 4(r12); /* Get pte entry */ \ +1000: #else #define FIND_PTE \ - rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ - lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm r12, r13, 14, 18, 28; /* Compute pgdir/pmd offset */ \ + add r12, r11, r12; \ + lwz r11, 4(r12); /* Get pgd/pmd entry */ \ rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ beq 2f; /* Bail if no table */ \ - rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + rlwimi r12, r13, 23, 20, 28; /* Compute pte address */ \ lwz r11, 4(r12); /* Get pte entry */ #endif /* HUGEPAGE */ #else /* !PTE_64BIT */ #define FIND_PTE \ - rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ + rlwimi r11, r13, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ lwz r11, 0(r11); /* Get L1 entry */ \ rlwinm. r12, r11, 0, 0, 19; /* Extract L2 (pte) base address */ \ beq 2f; /* Bail if no table */ \ - rlwimi r12, r10, 22, 20, 29; /* Compute PTE address */ \ + rlwimi r12, r13, 22, 20, 29; /* Compute PTE address */ \ lwz r11, 0(r12); /* Get Linux PTE */ #endif @@ -441,13 +442,13 @@ START_BTB_FLUSH_SECTION BTB_FLUSH(r10) 1: END_BTB_FLUSH_SECTION - mfspr r10, SPRN_DEAR /* Get faulting address */ + mfspr r13, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw 5, r10, r11 + cmplw 5, r13, r11 blt 5, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -470,29 +471,14 @@ END_BTB_FLUSH_SECTION #endif 4: - /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_WRITE position as trying to write - * to an RO page is pretty common, we don't do it with - * _PAGE_DIRTY. We could do it, but it's a fairly rare - * event so I'd rather take the overhead when it happens - * rather than adding an instruction here. We should measure - * whether the whole thing is worth it in the first place - * as we could avoid loading SPRN_ESR completely in the first - * place... - * - * TODO: Is it worth doing that mfspr & rlwimi in the first - * place or can we save a couple of instructions here ? - */ - mfspr r12,SPRN_ESR + FIND_PTE + #ifdef CONFIG_PTE_64BIT li r13,_PAGE_PRESENT|_PAGE_BAP_SR oris r13,r13,_PAGE_ACCESSED@h #else li r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED #endif - rlwimi r13,r12,11,29,29 - - FIND_PTE andc. r13,r13,r11 /* Check permission */ #ifdef CONFIG_PTE_64BIT @@ -549,13 +535,13 @@ START_BTB_FLUSH_SECTION 1: END_BTB_FLUSH_SECTION - mfspr r10, SPRN_SRR0 /* Get faulting address */ + mfspr r13, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw 5, r10, r11 + cmplw 5, r13, r11 blt 5, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -564,6 +550,7 @@ END_BTB_FLUSH_SECTION rlwinm r12,r12,0,16,1 mtspr SPRN_MAS1,r12 + FIND_PTE /* Make up the required permissions for kernel code */ #ifdef CONFIG_PTE_64BIT li r13,_PAGE_PRESENT | _PAGE_BAP_SX @@ -584,6 +571,7 @@ END_BTB_FLUSH_SECTION beq 2f /* KUAP fault */ #endif + FIND_PTE /* Make up the required permissions for user code */ #ifdef CONFIG_PTE_64BIT li r13,_PAGE_PRESENT | _PAGE_BAP_UX @@ -593,7 +581,6 @@ END_BTB_FLUSH_SECTION #endif 4: - FIND_PTE andc. r13,r13,r11 /* Check permission */ #ifdef CONFIG_PTE_64BIT @@ -746,17 +733,12 @@ finish_tlb_load: lwz r15, 0(r14) 100: stw r15, 0(r17) - /* - * Calc MAS1_TSIZE from r10 (which has pshift encoded) - * tlb_enc = (pshift - 10). - */ - subi r15, r10, 10 mfspr r16, SPRN_MAS1 - rlwimi r16, r15, 7, 20, 24 + rlwimi r16, r10, MAS1_TSIZE_SHIFT, MAS1_TSIZE_MASK mtspr SPRN_MAS1, r16 /* copy the pshift for use later */ - mr r14, r10 + addi r14, r10, _PAGE_PSIZE_SHIFT_OFFSET /* fall through */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index edc479a7c2bc..ac74321b1192 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -415,14 +415,13 @@ FixupDAR:/* Entry point for dcbx workaround. */ oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha 3: lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ + rlwinm r11, r11, 0, ~_PMD_PAGE_8M mtspr SPRN_MD_TWC, r11 - mtcrf 0x01, r11 mfspr r11, SPRN_MD_TWC lwz r11, 0(r11) /* Get the pte */ - bt 28,200f /* bit 28 = Large page (8M) */ /* concat physical page address(r11) and page offset(r10) */ rlwimi r11, r10, 0, 32 - PAGE_SHIFT, 31 -201: lwz r11,0(r11) + lwz r11,0(r11) /* Check if it really is a dcbx instruction. */ /* dcbt and dcbtst does not generate DTLB Misses/Errors, * no need to include them here */ @@ -441,11 +440,6 @@ FixupDAR:/* Entry point for dcbx workaround. */ 141: mfspr r10,SPRN_M_TW b DARFixed /* Nope, go back to normal TLB processing */ -200: - /* concat physical page address(r11) and page offset(r10) */ - rlwimi r11, r10, 0, 32 - PAGE_SHIFT_8M, 31 - b 201b - 144: mfspr r10, SPRN_DSISR rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ mtspr SPRN_DSISR, r10 diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ae36a129789f..22f83fbbc762 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -696,11 +696,7 @@ __init u64 ppc64_bolted_size(void) { #ifdef CONFIG_PPC_BOOK3E_64 /* Freescale BookE bolts the entire linear mapping */ - /* XXX: BookE ppc64_rma_limit setup seems to disagree? */ - if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - return linear_map_top; - /* Other BookE, we assume the first GB is bolted */ - return 1ul << 30; + return linear_map_top; #else /* BookS radix, does not take faults on linear mapping */ if (early_radix_enabled()) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 01c3b4b65241..6727a15ab94f 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1233,10 +1233,6 @@ void __init hash__early_init_mmu(void) __pmd_table_size = H_PMD_TABLE_SIZE; __pud_table_size = H_PUD_TABLE_SIZE; __pgd_table_size = H_PGD_TABLE_SIZE; - /* - * 4k use hugepd format, so for hash set then to - * zero - */ __pmd_val_bits = HASH_PMD_VAL_BITS; __pud_val_bits = HASH_PUD_VAL_BITS; __pgd_val_bits = HASH_PGD_VAL_BITS; @@ -1546,6 +1542,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, goto bail; } + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) { + if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M) + hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift; + if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G) + hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift; + } + /* * Add _PAGE_PRESENT to the required access perm. If there are parallel * updates to the pte that can possibly clear _PAGE_PTE, catch that too. diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index 5a2e512e96db..83c3361b358b 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* + * If hash-4k, hugepages use seeral contiguous PxD entries + * so bail out and let mm make the page young or dirty + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { + if (!(old_pte & _PAGE_ACCESSED)) + return 1; + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) + return 1; + } /* * Try to lock the PTE, add ACCESSED and DIRTY if it was diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 2975ea0841ba..f4d8d3c40e5c 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -461,18 +461,6 @@ static inline void pgtable_free(void *table, int index) case PUD_INDEX: __pud_free(table); break; -#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) - /* 16M hugepd directory at pud level */ - case HTLB_16M_INDEX: - BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); - break; - /* 16G hugepd directory at the pgd level */ - case HTLB_16G_INDEX: - BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); - break; -#endif /* We don't free pgd table via RCU callback */ default: BUG(); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 594a4b7b2ca2..6b043180220a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -28,8 +28,6 @@ bool hugetlb_disabled = false; -#define hugepd_none(hpd) (hpd_val(hpd) == 0) - #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ __builtin_ffs(sizeof(void *))) @@ -42,156 +40,43 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s return __find_linux_pte(mm->pgd, addr, NULL, NULL); } -static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned int pdshift, - unsigned int pshift, spinlock_t *ptl) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { - struct kmem_cache *cachep; - pte_t *new; - int i; - int num_hugepd; - - if (pshift >= pdshift) { - cachep = PGT_CACHE(PTE_T_ORDER); - num_hugepd = 1 << (pshift - pdshift); - } else { - cachep = PGT_CACHE(pdshift - pshift); - num_hugepd = 1; - } - - if (!cachep) { - WARN_ONCE(1, "No page table cache created for hugetlb tables"); - return -ENOMEM; - } - - new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; - BUG_ON(pshift > HUGEPD_SHIFT_MASK); - BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); + addr &= ~(sz - 1); - if (!new) - return -ENOMEM; + p4d = p4d_offset(pgd_offset(mm, addr), addr); + if (!mm_pud_folded(mm) && sz >= P4D_SIZE) + return (pte_t *)p4d; - /* - * Make sure other cpus find the hugepd set only after a - * properly initialized page table is visible to them. - * For more details look for comment in __pte_alloc(). - */ - smp_wmb(); + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + if (!mm_pmd_folded(mm) && sz >= PUD_SIZE) + return (pte_t *)pud; - spin_lock(ptl); - /* - * We have multiple higher-level entries that point to the same - * actual pte location. Fill in each as we go and backtrack on error. - * We need all of these so the DTLB pgtable walk code can find the - * right higher-level entry without knowing if it's a hugepage or not. - */ - for (i = 0; i < num_hugepd; i++, hpdp++) { - if (unlikely(!hugepd_none(*hpdp))) - break; - hugepd_populate(hpdp, new, pshift); - } - /* If we bailed from the for loop early, an error occurred, clean up */ - if (i < num_hugepd) { - for (i = i - 1 ; i >= 0; i--, hpdp--) - *hpdp = __hugepd(0); - kmem_cache_free(cachep, new); - } else { - kmemleak_ignore(new); - } - spin_unlock(ptl); - return 0; -} + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; -/* - * At this point we do the placement change only for BOOK3S 64. This would - * possibly work on other subarchs. - */ -pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - p4d_t *p4; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - unsigned pshift = __ffs(sz); - unsigned pdshift = PGDIR_SHIFT; - spinlock_t *ptl; - - addr &= ~(sz-1); - pg = pgd_offset(mm, addr); - p4 = p4d_offset(pg, addr); + if (sz >= PMD_SIZE) { + /* On 8xx, all hugepages are handled as contiguous PTEs */ + if (IS_ENABLED(CONFIG_PPC_8xx)) { + int i; -#ifdef CONFIG_PPC_BOOK3S_64 - if (pshift == PGDIR_SHIFT) - /* 16GB huge page */ - return (pte_t *) p4; - else if (pshift > PUD_SHIFT) { - /* - * We need to use hugepd table - */ - ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)p4; - } else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, p4, addr); - if (!pu) - return NULL; - if (pshift == PUD_SHIFT) - return (pte_t *)pu; - else if (pshift > PMD_SHIFT) { - ptl = pud_lockptr(mm, pu); - hpdp = (hugepd_t *)pu; - } else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - if (!pm) - return NULL; - if (pshift == PMD_SHIFT) - /* 16MB hugepage */ - return (pte_t *)pm; - else { - ptl = pmd_lockptr(mm, pm); - hpdp = (hugepd_t *)pm; + for (i = 0; i < sz / PMD_SIZE; i++) { + if (!pte_alloc_huge(mm, pmd + i, addr)) + return NULL; } } + return (pte_t *)pmd; } -#else - if (pshift >= PGDIR_SHIFT) { - ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)p4; - } else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, p4, addr); - if (!pu) - return NULL; - if (pshift >= PUD_SHIFT) { - ptl = pud_lockptr(mm, pu); - hpdp = (hugepd_t *)pu; - } else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - if (!pm) - return NULL; - ptl = pmd_lockptr(mm, pm); - hpdp = (hugepd_t *)pm; - } - } -#endif - if (!hpdp) - return NULL; - - if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) - return pte_alloc_huge(mm, (pmd_t *)hpdp, addr); - - BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, - pdshift, pshift, ptl)) - return NULL; - - return hugepte_offset(*hpdp, addr, pdshift); + return pte_alloc_huge(mm, pmd, addr); } #ifdef CONFIG_PPC_BOOK3S_64 @@ -248,264 +133,6 @@ int __init alloc_bootmem_huge_page(struct hstate *h, int nid) return __alloc_bootmem_huge_page(h, nid); } -#ifndef CONFIG_PPC_BOOK3S_64 -#define HUGEPD_FREELIST_SIZE \ - ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) - -struct hugepd_freelist { - struct rcu_head rcu; - unsigned int index; - void *ptes[]; -}; - -static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); - -static void hugepd_free_rcu_callback(struct rcu_head *head) -{ - struct hugepd_freelist *batch = - container_of(head, struct hugepd_freelist, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); - - free_page((unsigned long)batch); -} - -static void hugepd_free(struct mmu_gather *tlb, void *hugepte) -{ - struct hugepd_freelist **batchp; - - batchp = &get_cpu_var(hugepd_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - mm_is_thread_local(tlb->mm)) { - kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); - put_cpu_var(hugepd_freelist_cur); - return; - } - - if (*batchp == NULL) { - *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); - (*batchp)->index = 0; - } - - (*batchp)->ptes[(*batchp)->index++] = hugepte; - if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { - call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); - *batchp = NULL; - } - put_cpu_var(hugepd_freelist_cur); -} -#else -static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} -#endif - -/* Return true when the entry to be freed maps more than the area being freed */ -static bool range_is_outside_limits(unsigned long start, unsigned long end, - unsigned long floor, unsigned long ceiling, - unsigned long mask) -{ - if ((start & mask) < floor) - return true; - if (ceiling) { - ceiling &= mask; - if (!ceiling) - return true; - } - return end - 1 > ceiling - 1; -} - -static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, - unsigned long start, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pte_t *hugepte = hugepd_page(*hpdp); - int i; - - unsigned long pdmask = ~((1UL << pdshift) - 1); - unsigned int num_hugepd = 1; - unsigned int shift = hugepd_shift(*hpdp); - - /* Note: On fsl the hpdp may be the first of several */ - if (shift > pdshift) - num_hugepd = 1 << (shift - pdshift); - - if (range_is_outside_limits(start, end, floor, ceiling, pdmask)) - return; - - for (i = 0; i < num_hugepd; i++, hpdp++) - *hpdp = __hugepd(0); - - if (shift >= pdshift) - hugepd_free(tlb, hugepte); - else - pgtable_free_tlb(tlb, hugepte, - get_hugepd_cache_index(pdshift - shift)); -} - -static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pgtable_t token = pmd_pgtable(*pmd); - - if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) - return; - - pmd_clear(pmd); - pte_free_tlb(tlb, token, addr); - mm_dec_nr_ptes(tlb->mm); -} - -static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pmd_t *pmd; - unsigned long next; - unsigned long start; - - start = addr; - do { - unsigned long more; - - pmd = pmd_offset(pud, addr); - next = pmd_addr_end(addr, end); - if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { - if (pmd_none_or_clear_bad(pmd)) - continue; - - /* - * if it is not hugepd pointer, we should already find - * it cleared. - */ - WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); - - hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); - - continue; - } - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, - addr, next, floor, ceiling); - } while (addr = next, addr != end); - - if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK)) - return; - - pmd = pmd_offset(pud, start & PUD_MASK); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start & PUD_MASK); - mm_dec_nr_pmds(tlb->mm); -} - -static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pud_t *pud; - unsigned long next; - unsigned long start; - - start = addr; - do { - pud = pud_offset(p4d, addr); - next = pud_addr_end(addr, end); - if (!is_hugepd(__hugepd(pud_val(*pud)))) { - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, - ceiling); - } else { - unsigned long more; - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); - - if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK)) - return; - - pud = pud_offset(p4d, start & PGDIR_MASK); - p4d_clear(p4d); - pud_free_tlb(tlb, pud, start & PGDIR_MASK); - mm_dec_nr_puds(tlb->mm); -} - -/* - * This function frees user-level page tables of a process. - */ -void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned long next; - - /* - * Because there are a number of different possible pagetable - * layouts for hugepage ranges, we limit knowledge of how - * things should be laid out to the allocation path - * (huge_pte_alloc(), above). Everything else works out the - * structure as it goes from information in the hugepd - * pointers. That means that we can't here use the - * optimization used in the normal page free_pgd_range(), of - * checking whether we're actually covering a large enough - * range to have to do anything at the top level of the walk - * instead of at the bottom. - * - * To make sense of this, you should probably go read the big - * block comment at the top of the normal free_pgd_range(), - * too. - */ - - do { - next = pgd_addr_end(addr, end); - pgd = pgd_offset(tlb->mm, addr); - p4d = p4d_offset(pgd, addr); - if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { - if (p4d_none_or_clear_bad(p4d)) - continue; - hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); - } else { - unsigned long more; - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at the pgd level - * for a single hugepage, but all of them point to the - * same kmem cache that holds the hugepte. - */ - more = addr + (1UL << hugepd_shift(*(hugepd_t *)pgd)); - if (more > next) - next = more; - - free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); @@ -552,44 +179,14 @@ static int __init hugetlbpage_init(void) for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; - unsigned pdshift; if (!mmu_psize_defs[psize].shift) continue; shift = mmu_psize_to_shift(psize); -#ifdef CONFIG_PPC_BOOK3S_64 - if (shift > PGDIR_SHIFT) - continue; - else if (shift > PUD_SHIFT) - pdshift = PGDIR_SHIFT; - else if (shift > PMD_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PMD_SHIFT; -#else - if (shift < PUD_SHIFT) - pdshift = PMD_SHIFT; - else if (shift < PGDIR_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PGDIR_SHIFT; -#endif - if (add_huge_page_size(1ULL << shift) < 0) continue; - /* - * if we have pdshift and shift value same, we don't - * use pgt cache for hugepd. - */ - if (pdshift > shift) { - if (!IS_ENABLED(CONFIG_PPC_8xx)) - pgtable_cache_add(pdshift - shift); - } else if (IS_ENABLED(CONFIG_PPC_E500) || - IS_ENABLED(CONFIG_PPC_8xx)) { - pgtable_cache_add(PTE_T_ORDER); - } configured = true; } diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 21131b96d209..9b4a675eb8f8 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -123,12 +123,8 @@ void pgtable_cache_add(unsigned int shift) /* When batching pgtable pointers for RCU freeing, we store * the index size in the low bits. Table alignment must be * big enough to fit it. - * - * Likewise, hugeapge pagetable pointers contain a (different) - * shift value in the low bits. All tables must be aligned so - * as to leave enough 0 bits in the address to contain it. */ - unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, - HUGEPD_SHIFT_MASK + 1); + */ + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; struct kmem_cache *new = NULL; /* It would be nice if this was a BUILD_BUG_ON(), but at the diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c index 2784224054f8..989d6cdf4141 100644 --- a/arch/powerpc/mm/kasan/8xx.c +++ b/arch/powerpc/mm/kasan/8xx.c @@ -6,28 +6,33 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> +#include <asm/pgalloc.h> + static int __init kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block) { pmd_t *pmd = pmd_off_k(k_start); unsigned long k_cur, k_next; - for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) { - pte_basic_t *new; + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) { + pte_t *ptep; + int i; k_next = pgd_addr_end(k_cur, k_end); - k_next = pgd_addr_end(k_next, k_end); if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) continue; - new = memblock_alloc(sizeof(pte_basic_t), SZ_4K); - if (!new) + ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + if (!ptep) return -ENOMEM; - *new = pte_val(pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block)), PAGE_KERNEL))); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL)); - hugepd_populate_kernel((hugepd_t *)pmd, (pte_t *)new, PAGE_SHIFT_8M); - hugepd_populate_kernel((hugepd_t *)pmd + 1, (pte_t *)new, PAGE_SHIFT_8M); + __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1); + } + pmd_populate_kernel(&init_mm, pmd, ptep); + *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M); } return 0; } diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 43d4842bb1c7..388bba0ab3e7 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -11,6 +11,7 @@ #include <linux/hugetlb.h> #include <asm/fixmap.h> +#include <asm/pgalloc.h> #include <mm/mmu_decl.h> @@ -48,20 +49,6 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va) -{ - if (hpd_val(*pmdp) == 0) { - pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K); - - if (!ptep) - return NULL; - - hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M); - hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M); - } - return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT); -} - static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, pgprot_t prot, int psize, bool new) { @@ -75,26 +62,36 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, if (WARN_ON(slab_is_available())) return -EINVAL; - if (psize == MMU_PAGE_512K) + if (psize == MMU_PAGE_512K) { ptep = early_pte_alloc_kernel(pmdp, va); - else - ptep = early_hugepd_alloc_kernel((hugepd_t *)pmdp, va); + /* The PTE should never be already present */ + if (WARN_ON(pte_present(*ptep) && pgprot_val(prot))) + return -EINVAL; + } else { + if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1)))) + return -EINVAL; + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp + 1, ptep); + + ptep = (pte_t *)pmdp; + } } else { if (psize == MMU_PAGE_512K) ptep = pte_offset_kernel(pmdp, va); else - ptep = hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT); + ptep = (pte_t *)pmdp; } if (WARN_ON(!ptep)) return -ENOMEM; - /* The PTE should never be already present */ - if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot))) - return -EINVAL; - set_huge_pte_at(&init_mm, va, ptep, - pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize); + pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), + 1UL << mmu_psize_to_shift(psize)); return 0; } diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index 86d0fe434824..cf60c776c883 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += mmu_context.o tlb.o tlb_low.o kup.o -obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_E500) += e500.o diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 1c5e4ecbebeb..ad2a7c26f2a0 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start, _PAGE_KERNEL_RW; /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf); /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8; /* For each PTE for that area, map things. Note that we don't * increment phys because all PTEs are of the large size and diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 5ffa0af4328a..b653a7be4cb1 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -53,37 +53,30 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { [MMU_PAGE_4K] = { .shift = 12, - .enc = BOOK3E_PAGESZ_4K, }, [MMU_PAGE_2M] = { .shift = 21, - .enc = BOOK3E_PAGESZ_2M, }, [MMU_PAGE_4M] = { .shift = 22, - .enc = BOOK3E_PAGESZ_4M, }, [MMU_PAGE_16M] = { .shift = 24, - .enc = BOOK3E_PAGESZ_16M, }, [MMU_PAGE_64M] = { .shift = 26, - .enc = BOOK3E_PAGESZ_64M, }, [MMU_PAGE_256M] = { .shift = 28, - .enc = BOOK3E_PAGESZ_256M, }, [MMU_PAGE_1G] = { .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, }, }; static inline int mmu_get_tsize(int psize) { - return mmu_psize_defs[psize].enc; + return mmu_psize_defs[psize].shift - 10; } #else static inline int mmu_get_tsize(int psize) @@ -110,28 +103,6 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { }; #endif -/* The variables below are currently only used on 64-bit Book3E - * though this will probably be made common with other nohash - * implementations at some point - */ -#ifdef CONFIG_PPC64 - -int mmu_pte_psize; /* Page size used for PTE pages */ -int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ -int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ -unsigned long linear_map_top; /* Top of linear mapping */ - - -/* - * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug - * exceptions. This is used for bolted and e6500 TLB miss handlers which - * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, - * this is set to zero. - */ -int extlb_level_exc; - -#endif /* CONFIG_PPC64 */ - #ifdef CONFIG_PPC_E500 /* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ DEFINE_PER_CPU(int, next_tlbcam_idx); @@ -358,381 +329,7 @@ void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm(tlb->mm); } -/* - * Below are functions specific to the 64-bit variant of Book3E though that - * may change in the future - */ - -#ifdef CONFIG_PPC64 - -/* - * Handling of virtual linear page tables or indirect TLB entries - * flushing when PTE pages are freed - */ -void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - int tsize = mmu_psize_defs[mmu_pte_psize].enc; - - if (book3e_htw_mode != PPC_HTW_NONE) { - unsigned long start = address & PMD_MASK; - unsigned long end = address + PMD_SIZE; - unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; - - /* This isn't the most optimal, ideally we would factor out the - * while preempt & CPU mask mucking around, or even the IPI but - * it will do for now - */ - while (start < end) { - __flush_tlb_page(tlb->mm, start, tsize, 1); - start += size; - } - } else { - unsigned long rmask = 0xf000000000000000ul; - unsigned long rid = (address & rmask) | 0x1000000000000000ul; - unsigned long vpte = address & ~rmask; - - vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; - vpte |= rid; - __flush_tlb_page(tlb->mm, vpte, tsize, 0); - } -} - -static void __init setup_page_sizes(void) -{ - unsigned int tlb0cfg; - unsigned int tlb0ps; - unsigned int eptcfg; - int i, psize; - -#ifdef CONFIG_PPC_E500 - unsigned int mmucfg = mfspr(SPRN_MMUCFG); - int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); - unsigned int min_pg, max_pg; - - min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; - max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def; - unsigned int shift; - - def = &mmu_psize_defs[psize]; - shift = def->shift; - - if (shift == 0 || shift & 1) - continue; - - /* adjust to be in terms of 4^shift Kb */ - shift = (shift - 10) >> 1; - - if ((shift >= min_pg) && (shift <= max_pg)) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - goto out; - } - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { - u32 tlb1cfg, tlb1ps; - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb1cfg = mfspr(SPRN_TLB1CFG); - tlb1ps = mfspr(SPRN_TLB1PS); - eptcfg = mfspr(SPRN_EPTCFG); - - if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) - book3e_htw_mode = PPC_HTW_E6500; - - /* - * We expect 4K subpage size and unrestricted indirect size. - * The lack of a restriction on indirect size is a Freescale - * extension, indicated by PSn = 0 but SPSn != 0. - */ - if (eptcfg != 2) - book3e_htw_mode = PPC_HTW_NONE; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (!def->shift) - continue; - - if (tlb1ps & (1U << (def->shift - 10))) { - def->flags |= MMU_PAGE_SIZE_DIRECT; - - if (book3e_htw_mode && psize == MMU_PAGE_2M) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - } - } - - goto out; - } -#endif - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb0ps = mfspr(SPRN_TLB0PS); - eptcfg = mfspr(SPRN_EPTCFG); - - /* Look for supported direct sizes */ - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb0ps & (1U << (def->shift - 10))) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - /* Indirect page sizes supported ? */ - if ((tlb0cfg & TLBnCFG_IND) == 0 || - (tlb0cfg & TLBnCFG_PT) == 0) - goto out; - - book3e_htw_mode = PPC_HTW_IBM; - - /* Now, we only deal with one IND page size for each - * direct size. Hopefully all implementations today are - * unambiguous, but we might want to be careful in the - * future. - */ - for (i = 0; i < 3; i++) { - unsigned int ps, sps; - - sps = eptcfg & 0x1f; - eptcfg >>= 5; - ps = eptcfg & 0x1f; - eptcfg >>= 5; - if (!ps || !sps) - continue; - for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (ps == (def->shift - 10)) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - if (sps == (def->shift - 10)) - def->ind = ps + 10; - } - } - -out: - /* Cleanup array and print summary */ - pr_info("MMU: Supported page sizes\n"); - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - const char *__page_type_names[] = { - "unsupported", - "direct", - "indirect", - "direct & indirect" - }; - if (def->flags == 0) { - def->shift = 0; - continue; - } - pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), - __page_type_names[def->flags & 0x3]); - } -} - -static void __init setup_mmu_htw(void) -{ - /* - * If we want to use HW tablewalk, enable it by patching the TLB miss - * handlers to branch to the one dedicated to it. - */ - - switch (book3e_htw_mode) { - case PPC_HTW_IBM: - patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); - break; -#ifdef CONFIG_PPC_E500 - case PPC_HTW_E6500: - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); - break; -#endif - } - pr_info("MMU: Book3E HW tablewalk %s\n", - book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); -} - -/* - * Early initialization of the MMU TLB code - */ -static void early_init_this_mmu(void) -{ - unsigned int mas4; - - /* Set MAS4 based on page table setting */ - - mas4 = 0x4 << MAS4_WIMGED_SHIFT; - switch (book3e_htw_mode) { - case PPC_HTW_E6500: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; - mas4 |= MAS4_TLBSELD(1); - mmu_pte_psize = MMU_PAGE_2M; - break; - - case PPC_HTW_IBM: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_1M; - break; - - case PPC_HTW_NONE: - mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; - mmu_pte_psize = mmu_virtual_psize; - break; - } - mtspr(SPRN_MAS4, mas4); - -#ifdef CONFIG_PPC_E500 - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned int num_cams; - bool map = true; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - /* - * Only do the mapping once per core, or else the - * transient mapping would cause problems. - */ -#ifdef CONFIG_SMP - if (hweight32(get_tensr()) > 1) - map = false; -#endif - - if (map) - linear_map_top = map_mem_in_cams(linear_map_top, - num_cams, false, true); - } -#endif - - /* A sync won't hurt us after mucking around with - * the MMU configuration - */ - mb(); -} - -static void __init early_init_mmu_global(void) -{ - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - * - * Freescale booke only supports 4K pages in TLB0, so use that. - */ - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - mmu_vmemmap_psize = MMU_PAGE_4K; - else - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. - */ - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - -#ifdef CONFIG_PPC_E500 - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - if (book3e_htw_mode == PPC_HTW_NONE) { - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); - patch_exception(0x1e0, - exc_instruction_tlb_miss_bolted_book3e); - } - } -#endif - - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); - - ioremap_bot = IOREMAP_BASE; -} - -static void __init early_mmu_set_memory_limit(void) -{ -#ifdef CONFIG_PPC_E500 - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - /* - * Limit memory so we dont have linear faults. - * Unlike memblock_set_current_limit, which limits - * memory available during early boot, this permanently - * reduces the memory available to Linux. We need to - * do this because highmem is not supported on 64-bit. - */ - memblock_enforce_memory_limit(linear_map_top); - } -#endif - - memblock_set_current_limit(linear_map_top); -} - -/* boot cpu only */ -void __init early_init_mmu(void) -{ - early_init_mmu_global(); - early_init_this_mmu(); - early_mmu_set_memory_limit(); -} - -void early_init_mmu_secondary(void) -{ - early_init_this_mmu(); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* On non-FSL Embedded 64-bit, we adjust the RMA size to match - * the bolted TLB entry. We know for now that only 1G - * entries are supported though that may eventually - * change. - * - * on FSL Embedded 64-bit, usually all RAM is bolted, but with - * unusual memory sizes it's possible for some RAM to not be mapped - * (such RAM is not used at all by Linux, since we don't support - * highmem on 64-bit). We limit ppc64_rma_size to what would be - * mappable if this memblock is the only one. Additional memblocks - * can only increase, not decrease, the amount that ends up getting - * mapped. We still limit max to 1G even if we'll eventually map - * more. This is due to what the early init code is set up to do. - * - * We crop it to the size of the first MEMBLOCK to - * avoid going over total available memory just in case... - */ -#ifdef CONFIG_PPC_E500 - if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned long linear_sz; - unsigned int num_cams; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - linear_sz = map_mem_in_cams(first_memblock_size, num_cams, - true, true); - - ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); - } else -#endif - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(first_memblock_base + ppc64_rma_size); -} -#else /* ! CONFIG_PPC64 */ +#ifndef CONFIG_PPC64 void __init early_init_mmu(void) { unsigned long root = of_get_flat_dt_root(); diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c new file mode 100644 index 000000000000..113edf76d3ce --- /dev/null +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> + +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/code-patching.h> +#include <asm/cputhreads.h> + +#include <mm/mmu_decl.h> + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ +int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ +unsigned long linear_map_top; /* Top of linear mapping */ + + +/* + * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug + * exceptions. This is used for bolted and e6500 TLB miss handlers which + * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, + * this is set to zero. + */ +int extlb_level_exc; + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void __init setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int eptcfg; + int psize; + + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. + */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (!def->shift) + continue; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +/* + * Early initialization of the MMU TLB code + */ +static void early_init_this_mmu(void) +{ + unsigned int mas4; + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_NONE: + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + + unsigned int num_cams; + bool map = true; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + /* + * Only do the mapping once per core, or else the + * transient mapping would cause problems. + */ +#ifdef CONFIG_SMP + if (hweight32(get_tensr()) > 1) + map = false; +#endif + + if (map) + linear_map_top = map_mem_in_cams(linear_map_top, + num_cams, false, true); + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +static void __init early_init_mmu_global(void) +{ + /* + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + mmu_vmemmap_psize = MMU_PAGE_4K; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + extlb_level_exc = EX_TLB_SIZE; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; + } + + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); + + ioremap_bot = IOREMAP_BASE; +} + +static void __init early_mmu_set_memory_limit(void) +{ + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. + */ + memblock_enforce_memory_limit(linear_map_top); + + memblock_set_current_limit(linear_map_top); +} + +/* boot cpu only */ +void __init early_init_mmu(void) +{ + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); +} + +void early_init_mmu_secondary(void) +{ + early_init_this_mmu(); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * On FSL Embedded 64-bit, usually all RAM is bolted, but with + * unusual memory sizes it's possible for some RAM to not be mapped + * (such RAM is not used at all by Linux, since we don't support + * highmem on 64-bit). We limit ppc64_rma_size to what would be + * mappable if this memblock is the only one. Additional memblocks + * can only increase, not decrease, the amount that ends up getting + * mapped. We still limit max to 1G even if we'll eventually map + * more. This is due to what the early init code is set up to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ + unsigned long linear_sz; + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 7e0b8fe1c279..de568297d5c5 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -450,11 +450,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT) tlb_miss_huge_e6500: beq tlb_miss_fault_e6500 - li r10,1 - andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ - rldimi r14,r10,63,0 /* Set PD_HUGE */ - xor r14,r14,r15 /* Clear size bits */ - ldx r14,0,r14 + rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e /* * Now we build the MAS for a huge page. @@ -465,7 +461,6 @@ tlb_miss_huge_e6500: * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler */ - subi r15,r15,10 /* Convert psize to tsize */ mfspr r10,SPRN_MAS1 rlwinm r10,r10,0,~MAS1_IND rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK @@ -511,232 +506,6 @@ itlb_miss_fault_e6500: tlb_epilog_bolted b exc_instruction_storage_book3e -/********************************************************************** - * * - * TLB miss handling for Book3E with TLB reservation and HES support * - * * - **********************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,44 /* get region */ - xoris r15,r15,0xc - cmpldi cr0,r15,0 /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - cmpldi cr1,r15,1 /* vmalloc mapping ? */ - - /* The page tables are mapped virtually linear. At this point, though, - * we don't know whether we are trying to fault in a first level - * virtual address or a virtual page table address. We can get that - * from bit 0x1 of the region ID which we have set for a page table - */ - andis. r10,r15,0x1 - bne- virt_page_table_tlb_miss - - std r14,EX_TLB_ESR(r12); /* save ESR */ - std r16,EX_TLB_DEAR(r12); /* save DEAR */ - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - li r11,_PAGE_PRESENT - oris r11,r11,_PAGE_ACCESSED@h - - /* We do the user/kernel test for the PID here along with the RW test - */ - srdi. r15,r16,60 /* Check for user region */ - - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. - * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - rlwimi r11,r14,32-19,27,27 - rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss_user - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ cr1,normal_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by writing a crazy value in ESR in our exception frame - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r15,r16,44 /* get region */ - xoris r15,r15,0xc - cmpldi cr0,r15,0 /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - cmpldi cr1,r15,1 /* vmalloc mapping ? */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - - srdi. r15,r16,60 /* Check for user region */ - std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss_user - - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ cr1,normal_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of the first-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = faulting address - * r15 = region ID - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ -normal_tlb_miss_user: -#ifdef CONFIG_PPC_KUAP - mfspr r14,SPRN_MAS1 - rlwinm. r14,r14,0,0x3fff0000 - beq- normal_tlb_miss_access_fault /* KUAP fault */ -#endif -normal_tlb_miss: - /* So we first construct the page table address. We do that by - * shifting the bottom of the address (not the region ID) by - * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and - * or'ing the fourth high bit. - * - * NOTE: For 64K pages, we do things slightly differently in - * order to handle the weird page table format used by linux - */ - srdi r15,r16,44 - oris r10,r15,0x1 - rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 - sldi r15,r10,44 - clrrdi r14,r14,19 - or r10,r15,r14 - - ld r14,0(r10) - -finish_normal_tlb_miss: - /* Check if required permissions are met */ - andc. r15,r11,r14 - bne- normal_tlb_miss_access_fault - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - * - * TODO: mix up code below for better scheduling - */ - clrrdi r10,r16,12 /* Clear low crap in EA */ - rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r10 - - /* Check page size, if not standard, update MAS1 */ - rldicl r10,r14,64-8,64-8 - cmpldi cr0,r10,BOOK3E_PAGESZ_4K - beq- 1f - mfspr r11,SPRN_MAS1 - rlwimi r11,r14,31,21,24 - rlwinm r11,r11,0,21,19 - mtspr SPRN_MAS1,r11 -1: - /* Move RPN in position */ - rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - clrldi r15,r11,12 /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - andi. r11,r14,_PAGE_DIRTY - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: - srdi r16,r15,32 - mtspr SPRN_MAS3,r15 - mtspr SPRN_MAS7,r16 - - tlbwe - -normal_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_EPILOG_SUCCESS - rfi - -normal_tlb_miss_access_fault: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_BAP_UX - bne 1f - ld r14,EX_TLB_DEAR(r12) - ld r15,EX_TLB_ESR(r12) - mtspr SPRN_DEAR,r14 - mtspr SPRN_ESR,r15 - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - /* * This is the guts of the second-level TLB miss handler for direct * misses. We are entered with: @@ -893,201 +662,6 @@ virt_page_table_tlb_miss_whacko_fault: TLB_MISS_EPILOG_ERROR b exc_data_storage_book3e - -/************************************************************** - * * - * TLB miss handling for Book3E with hw page table support * - * * - **************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_htw) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,44 /* get region */ - xoris r11,r11,0xc - cmpldi cr0,r11,0 /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - cmpldi cr1,r11,1 /* vmalloc mapping ? */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - srdi. r11,r16,60 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ cr1,htw_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_htw) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by keeping a crazy value for ESR in r14 - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r11,r16,44 /* get region */ - xoris r11,r11,0xc - cmpldi cr0,r11,0 /* linear mapping ? */ - beq tlb_load_linear /* yes -> go to linear map load */ - cmpldi cr1,r11,1 /* vmalloc mapping ? */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - srdi. r11,r16,60 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = virtual page table faulting address - * r15 = PGD pointer - * r14 = ESR - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will save/restore things for us - */ -htw_tlb_miss: -#ifdef CONFIG_PPC_KUAP - mfspr r10,SPRN_MAS1 - rlwinm. r10,r10,0,0x3fff0000 - beq- htw_tlb_miss_fault /* KUAP fault */ -#endif - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - * - * MAS1:IND should be already set based on MAS4 - */ - PPC_TLBSRX_DOT(0,R16) - beq htw_tlb_miss_done - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- htw_tlb_miss_fault - - /* Get the PGD pointer */ - cmpldi cr0,r15,0 - beq- htw_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Get to PUD entry */ - rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Get to PMD entry */ - rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Ok, we're all right, we can now create an indirect entry for - * a 1M or 256M page. - * - * The last trick is now that because we use "half" pages for - * the HTW (1M IND is 2K and 256M IND is 32K) we need to account - * for an added LSB bit to the RPN. For 64K pages, there is no - * problem as we already use 32K arrays (half PTE pages), but for - * 4K page we need to extract a bit from the virtual address and - * insert it into the "PA52" bit of the RPN. - */ - rlwimi r15,r16,32-9,20,20 - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - */ - ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) - - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 - - tlbwe - -htw_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_EPILOG_SUCCESS - rfi - -htw_tlb_miss_fault: - /* We need to check if it was an instruction miss. We know this - * though because r14 would contain -1 - */ - cmpdi cr0,r14,-1 - beq 1f - mtspr SPRN_DEAR,r16 - mtspr SPRN_ESR,r14 - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - /* * This is the guts of "any" level TLB miss handler for kernel linear * mapping misses. We are entered with: diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9e7ba9c3851f..ab0656115424 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -297,11 +297,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte, unsigned long sz) +static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val) { - pmd_t *pmd = pmd_off(mm, addr); - pte_basic_t val; pte_basic_t *entry = (pte_basic_t *)ptep; int num, i; @@ -311,15 +308,60 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = set_pte_filter(pte, addr); - - val = pte_val(pte); - num = number_of_cells_per_pte(pmd, val, 1); for (i = 0; i < num; i++, entry++, val += SZ_4K) *entry = val; } + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + pmd_t *pmdp = pmd_off(mm, addr); + + pte = set_pte_filter(pte, addr); + + if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */ + *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M); + *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M); + + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte)); + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M); + } else { + __set_huge_pte_at(pmdp, ptep, pte_val(pte)); + } +} +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + unsigned long pdsize; + int i; + + pte = set_pte_filter(pte, addr); + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + if (sz < PMD_SIZE) + pdsize = PAGE_SIZE; + else if (sz < PUD_SIZE) + pdsize = PMD_SIZE; + else if (sz < P4D_SIZE) + pdsize = PUD_SIZE; + else if (sz < PGDIR_SIZE) + pdsize = P4D_SIZE; + else + pdsize = PGDIR_SIZE; + + for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) { + __set_pte_at(mm, addr, ptep, pte, 0); + pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT)); + } +} #endif #endif /* CONFIG_HUGETLB_PAGE */ @@ -367,11 +409,10 @@ unsigned long vmalloc_to_phys(void *va) EXPORT_SYMBOL_GPL(vmalloc_to_phys); /* - * We have 4 cases for pgds and pmds: + * We have 3 cases for pgds and pmds: * (1) invalid (all zeroes) * (2) pointer to next table, as normal; bottom 6 bits == 0 * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table * * So long as we atomically load page table pointers we are safe against teardown, * we can follow the address down to the page and take a ref on it. @@ -382,11 +423,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *hpage_shift) { pgd_t *pgdp; +#ifdef CONFIG_PPC64 p4d_t p4d, *p4dp; pud_t pud, *pudp; +#endif pmd_t pmd, *pmdp; pte_t *ret_pte; - hugepd_t *hpdp = NULL; unsigned pdshift; if (hpage_shift) @@ -401,8 +443,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, * page fault or a page unmap. The return pte_t * is still not * stable. So should be checked there for above conditions. * Top level is an exception because it is folded into p4d. + * + * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to + * PMD level. */ pgdp = pgdir + pgd_index(ea); +#ifdef CONFIG_PPC64 p4dp = p4d_offset(pgdp, ea); p4d = READ_ONCE(*p4dp); pdshift = P4D_SHIFT; @@ -415,11 +461,6 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, goto out; } - if (is_hugepd(__hugepd(p4d_val(p4d)))) { - hpdp = (hugepd_t *)&p4d; - goto out_huge; - } - /* * Even if we end up with an unmap, the pgtable will not * be freed, because we do an rcu free and here we are @@ -437,13 +478,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, goto out; } - if (is_hugepd(__hugepd(pud_val(pud)))) { - hpdp = (hugepd_t *)&pud; - goto out_huge; - } - - pdshift = PMD_SHIFT; pmdp = pmd_offset(&pud, ea); +#else + pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea); +#endif + pdshift = PMD_SHIFT; pmd = READ_ONCE(*pmdp); /* @@ -476,19 +515,8 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, goto out; } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } - return pte_offset_kernel(&pmd, ea); -out_huge: - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); out: if (hpage_shift) *hpage_shift = pdshift; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index cfd622ebf774..787b22206386 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -48,7 +48,7 @@ notrace void __init early_ioremap_init(void) early_ioremap_setup(); } -static void __init *early_alloc_pgtable(unsigned long size) +void __init *early_alloc_pgtable(unsigned long size) { void *ptr = memblock_alloc(size, size); diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index b1ce97a9dbfc..faf3624d8057 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET -pte_t huge_ptep_get(pte_t *ptep); +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab7a759e1a8c..089f3c9f56a3 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -514,8 +514,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, addr, ptep, nr) \ + update_mmu_cache_range(NULL, vma, addr, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 0ebd968b33c9..42314f093922 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -3,7 +3,7 @@ #include <linux/err.h> #ifdef CONFIG_RISCV_ISA_SVNAPOT -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { unsigned long pte_num; int i; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 371c2bf88149..59e0d861e26f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -159,6 +159,7 @@ config S390 select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC select HAVE_ARCH_KCSAN + select HAVE_ARCH_KMSAN select HAVE_ARCH_KFENCE select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/s390/Makefile b/arch/s390/Makefile index f2b21c7a70ef..7fd57398221e 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -36,7 +36,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) UTS_MACHINE := s390x -STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) +STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384)) CHECKFLAGS += -D__s390__ -D__s390x__ export LD_BFD diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 070c9b2e905f..e7658997452b 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -3,11 +3,13 @@ # Makefile for the linux s390-specific parts of the memory manager. # +# Tooling runtimes are unavailable and cannot be linked for early boot code KCOV_INSTRUMENT := n GCOV_PROFILE := n UBSAN_SANITIZE := n KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) @@ -42,6 +44,7 @@ obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o +obj-$(CONFIG_KMSAN) += kmsan.o obj-all := $(obj-y) piggy.o syms.o targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) diff --git a/arch/s390/boot/kmsan.c b/arch/s390/boot/kmsan.c new file mode 100644 index 000000000000..e7b3ac48143e --- /dev/null +++ b/arch/s390/boot/kmsan.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmsan-checks.h> + +void kmsan_unpoison_memory(const void *address, size_t size) +{ +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 320c964cd603..c59014945af0 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -304,11 +304,18 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) MODULES_END = round_down(kernel_start, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; + if (IS_ENABLED(CONFIG_KMSAN)) + VMALLOC_END -= MODULES_LEN * 2; /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ vsize = (VMALLOC_END - FIXMAP_SIZE) / 2; vsize = round_down(vsize, _SEGMENT_SIZE); vmalloc_size = min(vmalloc_size, vsize); + if (IS_ENABLED(CONFIG_KMSAN)) { + /* take 2/3 of vmalloc area for KMSAN shadow and origins */ + vmalloc_size = round_down(vmalloc_size / 3, _SEGMENT_SIZE); + VMALLOC_END -= vmalloc_size * 2; + } VMALLOC_START = VMALLOC_END - vmalloc_size; __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE); diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c index faccb33b462c..f6b9b1df48a8 100644 --- a/arch/s390/boot/string.c +++ b/arch/s390/boot/string.c @@ -1,11 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#define IN_BOOT_STRING_C 1 #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/errno.h> #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC +#undef CONFIG_KMSAN #include "../lib/string.c" +/* + * Duplicate some functions from the common lib/string.c + * instead of fully including it. + */ + int strncmp(const char *cs, const char *ct, size_t count) { unsigned char c1, c2; @@ -22,6 +29,15 @@ int strncmp(const char *cs, const char *ct, size_t count) return 0; } +void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + uint64_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} + char *skip_spaces(const char *str) { while (isspace(*str)) diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index b89159591ca0..46f5c9660616 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -13,6 +13,7 @@ #define _S390_CHECKSUM_H #include <linux/instrumented.h> +#include <linux/kmsan-checks.h> #include <linux/in6.h> static inline __wsum cksm(const void *buff, int len, __wsum sum) @@ -23,6 +24,7 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum) }; instrument_read(buff, len); + kmsan_check_memory(buff, len); asm volatile("\n" "0: cksm %[sum],%[rp]\n" " jo 0b\n" diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index c786538e397c..dae8843b164f 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -12,6 +12,7 @@ #define _ASM_S390_CPACF_H #include <asm/facility.h> +#include <linux/kmsan-checks.h> /* * Instruction opcodes for the CPACF instructions @@ -542,6 +543,8 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) : "cc", "memory", "0"); + kmsan_unpoison_memory(ucbuf, ucbuf_len); + kmsan_unpoison_memory(cbuf, cbuf_len); } /** diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index a0de5b9b02ea..9e4bbc3e53f8 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -10,6 +10,7 @@ #define _ASM_S390_CPU_MF_H #include <linux/errno.h> +#include <linux/kmsan-checks.h> #include <asm/asm-extable.h> #include <asm/facility.h> @@ -239,6 +240,11 @@ static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest) : "=d" (cc) : "Q" (*dest), "d" (range), "i" (set) : "cc", "memory"); + /* + * If cc == 2, less than RANGE counters are stored, but it's not easy + * to tell how many. Always unpoison the whole range for simplicity. + */ + kmsan_unpoison_memory(dest, range * sizeof(u64)); return cc; } diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index ce5f4fe8be4d..cf1b5d6fb1a6 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(pte_t *ptep); +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -64,7 +64,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int changed = !pte_same(huge_ptep_get(ptep), pte); + int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h index 02427b205c11..bcab456dfb80 100644 --- a/arch/s390/include/asm/irqflags.h +++ b/arch/s390/include/asm/irqflags.h @@ -37,12 +37,18 @@ static __always_inline void __arch_local_irq_ssm(unsigned long flags) asm volatile("ssm %0" : : "Q" (flags) : "memory"); } -static __always_inline unsigned long arch_local_save_flags(void) +#ifdef CONFIG_KMSAN +#define arch_local_irq_attributes noinline notrace __no_sanitize_memory __maybe_unused +#else +#define arch_local_irq_attributes __always_inline +#endif + +static arch_local_irq_attributes unsigned long arch_local_save_flags(void) { return __arch_local_irq_stnsm(0xff); } -static __always_inline unsigned long arch_local_irq_save(void) +static arch_local_irq_attributes unsigned long arch_local_irq_save(void) { return __arch_local_irq_stnsm(0xfc); } @@ -52,7 +58,12 @@ static __always_inline void arch_local_irq_disable(void) arch_local_irq_save(); } -static __always_inline void arch_local_irq_enable(void) +static arch_local_irq_attributes void arch_local_irq_enable_external(void) +{ + __arch_local_irq_stosm(0x01); +} + +static arch_local_irq_attributes void arch_local_irq_enable(void) { __arch_local_irq_stosm(0x03); } diff --git a/arch/s390/include/asm/kmsan.h b/arch/s390/include/asm/kmsan.h new file mode 100644 index 000000000000..27db65fbf3f6 --- /dev/null +++ b/arch/s390/include/asm/kmsan.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_KMSAN_H +#define _ASM_S390_KMSAN_H + +#include <asm/lowcore.h> +#include <asm/page.h> +#include <linux/kmsan.h> +#include <linux/mmzone.h> +#include <linux/stddef.h> + +#ifndef MODULE + +static inline bool is_lowcore_addr(void *addr) +{ + return addr >= (void *)&S390_lowcore && + addr < (void *)(&S390_lowcore + 1); +} + +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + if (is_lowcore_addr(addr)) { + /* + * Different lowcores accessed via S390_lowcore are described + * by the same struct page. Resolve the prefix manually in + * order to get a distinct struct page. + */ + addr += (void *)lowcore_ptr[raw_smp_processor_id()] - + (void *)&S390_lowcore; + if (KMSAN_WARN_ON(is_lowcore_addr(addr))) + return NULL; + return kmsan_get_metadata(addr, is_origin); + } + return NULL; +} + +static inline bool kmsan_virt_addr_valid(void *addr) +{ + bool ret; + + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = virt_addr_valid(addr); + preempt_enable_no_resched(); + + return ret; +} + +#endif /* !MODULE */ + +#endif /* _ASM_S390_KMSAN_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b5632dbe5438..3fa280d0672a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -107,6 +107,18 @@ static inline int is_module_addr(void *addr) return 1; } +#ifdef CONFIG_KMSAN +#define KMSAN_VMALLOC_SIZE (VMALLOC_END - VMALLOC_START) +#define KMSAN_VMALLOC_SHADOW_START VMALLOC_END +#define KMSAN_VMALLOC_SHADOW_END (KMSAN_VMALLOC_SHADOW_START + KMSAN_VMALLOC_SIZE) +#define KMSAN_VMALLOC_ORIGIN_START KMSAN_VMALLOC_SHADOW_END +#define KMSAN_VMALLOC_ORIGIN_END (KMSAN_VMALLOC_ORIGIN_START + KMSAN_VMALLOC_SIZE) +#define KMSAN_MODULES_SHADOW_START KMSAN_VMALLOC_ORIGIN_END +#define KMSAN_MODULES_SHADOW_END (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#define KMSAN_MODULES_ORIGIN_START KMSAN_MODULES_SHADOW_END +#define KMSAN_MODULES_ORIGIN_END (KMSAN_MODULES_ORIGIN_START + MODULES_LEN) +#endif + #ifdef CONFIG_RANDOMIZE_BASE #define KASLR_LEN (1UL << 31) #else diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 351685de53d2..2ab868cbae6c 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -15,15 +15,12 @@ #define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ -#define __HAVE_ARCH_MEMSET16 /* arch function */ -#define __HAVE_ARCH_MEMSET32 /* arch function */ -#define __HAVE_ARCH_MEMSET64 /* arch function */ void *memcpy(void *dest, const void *src, size_t n); void *memset(void *s, int c, size_t n); void *memmove(void *dest, const void *src, size_t n); -#ifndef CONFIG_KASAN +#if !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) #define __HAVE_ARCH_MEMCHR /* inline & arch function */ #define __HAVE_ARCH_MEMCMP /* arch function */ #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ @@ -36,6 +33,9 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ +#define __HAVE_ARCH_MEMSET16 /* arch function */ +#define __HAVE_ARCH_MEMSET32 /* arch function */ +#define __HAVE_ARCH_MEMSET64 /* arch function */ /* Prototypes for non-inlined arch strings functions. */ int memcmp(const void *s1, const void *s2, size_t n); @@ -44,7 +44,7 @@ size_t strlcat(char *dest, const char *src, size_t n); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); char *strstr(const char *s1, const char *s2); -#endif /* !CONFIG_KASAN */ +#endif /* !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) */ #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR @@ -74,20 +74,30 @@ void *__memset16(uint16_t *s, uint16_t v, size_t count); void *__memset32(uint32_t *s, uint32_t v, size_t count); void *__memset64(uint64_t *s, uint64_t v, size_t count); +#ifdef __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t count) { return __memset16(s, v, count * sizeof(v)); } +#endif +#ifdef __HAVE_ARCH_MEMSET32 static inline void *memset32(uint32_t *s, uint32_t v, size_t count) { return __memset32(s, v, count * sizeof(v)); } +#endif +#ifdef __HAVE_ARCH_MEMSET64 +#ifdef IN_BOOT_STRING_C +void *memset64(uint64_t *s, uint64_t v, size_t count); +#else static inline void *memset64(uint64_t *s, uint64_t v, size_t count) { return __memset64(s, v, count * sizeof(v)); } +#endif +#endif #if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY)) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index a674c7d25da5..d02a709717b8 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -16,7 +16,7 @@ /* * General size of kernel stacks */ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) || defined(CONFIG_KMSAN) #define THREAD_SIZE_ORDER 4 #else #define THREAD_SIZE_ORDER 2 diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 81ae8a98e7ec..9213be0529ee 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -18,6 +18,7 @@ #include <asm/extable.h> #include <asm/facility.h> #include <asm-generic/access_ok.h> +#include <linux/instrumented.h> void debug_user_asce(int exit); @@ -78,13 +79,24 @@ union oac { int __noreturn __put_user_bad(void); -#define __put_user_asm(to, from, size) \ -({ \ +#ifdef CONFIG_KMSAN +#define get_put_user_noinstr_attributes \ + noinline __maybe_unused __no_sanitize_memory +#else +#define get_put_user_noinstr_attributes __always_inline +#endif + +#define DEFINE_PUT_USER(type) \ +static get_put_user_noinstr_attributes int \ +__put_user_##type##_noinstr(unsigned type __user *to, \ + unsigned type *from, \ + unsigned long size) \ +{ \ union oac __oac_spec = { \ .oac1.as = PSW_BITS_AS_SECONDARY, \ .oac1.a = 1, \ }; \ - int __rc; \ + int rc; \ \ asm volatile( \ " lr 0,%[spec]\n" \ @@ -93,12 +105,28 @@ int __noreturn __put_user_bad(void); "2:\n" \ EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ - : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [rc] "=&d" (rc), [_to] "+Q" (*(to)) \ : [_size] "d" (size), [_from] "Q" (*(from)), \ [spec] "d" (__oac_spec.val) \ : "cc", "0"); \ - __rc; \ -}) + return rc; \ +} \ + \ +static __always_inline int \ +__put_user_##type(unsigned type __user *to, unsigned type *from, \ + unsigned long size) \ +{ \ + int rc; \ + \ + rc = __put_user_##type##_noinstr(to, from, size); \ + instrument_put_user(*from, to, size); \ + return rc; \ +} + +DEFINE_PUT_USER(char); +DEFINE_PUT_USER(short); +DEFINE_PUT_USER(int); +DEFINE_PUT_USER(long); static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { @@ -106,24 +134,24 @@ static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned lon switch (size) { case 1: - rc = __put_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size); + rc = __put_user_char((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size); + rc = __put_user_short((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_user_asm((unsigned int __user *)ptr, + rc = __put_user_int((unsigned int __user *)ptr, (unsigned int *)x, size); break; case 8: - rc = __put_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size); + rc = __put_user_long((unsigned long __user *)ptr, + (unsigned long *)x, + size); break; default: __put_user_bad(); @@ -134,13 +162,17 @@ static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned lon int __noreturn __get_user_bad(void); -#define __get_user_asm(to, from, size) \ -({ \ +#define DEFINE_GET_USER(type) \ +static get_put_user_noinstr_attributes int \ +__get_user_##type##_noinstr(unsigned type *to, \ + unsigned type __user *from, \ + unsigned long size) \ +{ \ union oac __oac_spec = { \ .oac2.as = PSW_BITS_AS_SECONDARY, \ .oac2.a = 1, \ }; \ - int __rc; \ + int rc; \ \ asm volatile( \ " lr 0,%[spec]\n" \ @@ -149,13 +181,29 @@ int __noreturn __get_user_bad(void); "2:\n" \ EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ - : [rc] "=&d" (__rc), "=Q" (*(to)) \ + : [rc] "=&d" (rc), "=Q" (*(to)) \ : [_size] "d" (size), [_from] "Q" (*(from)), \ [spec] "d" (__oac_spec.val), [_to] "a" (to), \ [_ksize] "K" (size) \ : "cc", "0"); \ - __rc; \ -}) + return rc; \ +} \ + \ +static __always_inline int \ +__get_user_##type(unsigned type *to, unsigned type __user *from, \ + unsigned long size) \ +{ \ + int rc; \ + \ + rc = __get_user_##type##_noinstr(to, from, size); \ + instrument_get_user(*to); \ + return rc; \ +} + +DEFINE_GET_USER(char); +DEFINE_GET_USER(short); +DEFINE_GET_USER(int); +DEFINE_GET_USER(long); static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) { @@ -163,24 +211,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign switch (size) { case 1: - rc = __get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size); + rc = __get_user_char((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size); + rc = __get_user_short((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __get_user_asm((unsigned int *)x, + rc = __get_user_int((unsigned int *)x, (unsigned int __user *)ptr, size); break; case 8: - rc = __get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size); + rc = __get_user_long((unsigned long *)x, + (unsigned long __user *)ptr, + size); break; default: __get_user_bad(); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 9b65f04c83de..ac7b8c8e3133 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -282,12 +282,14 @@ int diag224(void *ptr) int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" + asm volatile("\n" + " diag %[type],%[addr],0x224\n" + "0: lhi %[rc],0\n" "1:\n" EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (addr) : "memory"); + : [rc] "+d" (rc) + , "=m" (*(struct { char buf[PAGE_SIZE]; } *)ptr) + : [type] "d" (0), [addr] "d" (addr)); return rc; } EXPORT_SYMBOL(diag224); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index ddf2ee47cb87..0bd6adc40a34 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -12,6 +12,7 @@ #include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/kmsan-checks.h> #include <linux/kprobes.h> #include <linux/execmem.h> #include <trace/syscall.h> @@ -303,6 +304,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + kmsan_unpoison_memory(fregs, sizeof(*fregs)); regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); if (!regs || unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index a7c211a3a0c9..160b2acba8db 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include <linux/cpu.h> #include <linux/entry-common.h> +#include <linux/kmsan.h> #include <asm/asm-extable.h> #include <asm/vtime.h> #include <asm/fpu.h> @@ -262,6 +263,11 @@ static void monitor_event_exception(struct pt_regs *regs) void kernel_stack_overflow(struct pt_regs *regs) { + /* + * Normally regs are unpoisoned by the generic entry code, but + * kernel_stack_overflow() is a rare case that is called bypassing it. + */ + kmsan_unpoison_entry_regs(regs); bust_spinlocks(1); printk("Kernel stack overflow.\n"); show_regs(regs); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 0ece156fdd7c..cd44be2b6ce8 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -49,6 +49,8 @@ static inline bool is_final_pt_regs(struct unwind_state *state, READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; @@ -118,6 +120,8 @@ out_stop: } EXPORT_SYMBOL_GPL(unwind_next_frame); +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame) { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 34d558164f0d..ded0eff58a19 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -169,7 +169,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, __set_huge_pte_at(mm, addr, ptep, pte); } -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); } @@ -177,7 +177,7 @@ pte_t huge_ptep_get(pte_t *ptep) pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; pud_t *pudp = (pud_t *) ptep; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 00b247d924a9..53d7cb5bbffe 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -490,7 +490,7 @@ void flush_dcache_folio(struct folio *folio) } set_dcache_dirty(folio, this_cpu); } else { - /* We could delay the flush for the !page_mapping + /* We could delay the flush for the !folio_mapping * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 28002cc7a37d..d8dbeac8b206 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -988,8 +988,6 @@ static void __meminit free_pagetable(struct page *page, int order) /* bootmem page has reserved flag */ if (PageReserved(page)) { - __ClearPageReserved(page); - magic = page->index; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) @@ -1362,18 +1360,6 @@ void __init mem_init(void) preallocate_vmalloc_pages(); } -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) -{ - /* - * More CPUs always led to greater speedups on tested systems, up to - * all the nodes' CPUs. Use all since the system is otherwise idle - * now. - */ - return max_t(int, cpumask_weight(node_cpumask), 1); -} -#endif - int kernel_set_to_readonly; void mark_rodata_ro(void) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 443a97e515c0..44f7b2ea6a07 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1119,8 +1119,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set - * otherwise pmd_present/pmd_huge will return true - * even on a non present pmd. + * otherwise pmd_present() will return true even on a non + * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 9a7e5e57ee9a..1647a7cc3fbf 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -410,9 +410,9 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, typedef pte_t *pte_addr_t; -void update_mmu_tlb(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); -#define __HAVE_ARCH_UPDATE_MMU_TLB +void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); +#define update_mmu_tlb_range update_mmu_tlb_range #endif /* !defined (__ASSEMBLY__) */ diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c index d8b60d6e50a8..0a1a815dc796 100644 --- a/arch/xtensa/mm/tlb.c +++ b/arch/xtensa/mm/tlb.c @@ -163,10 +163,10 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) } } -void update_mmu_tlb(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { - local_flush_tlb_page(vma, address); + local_flush_tlb_range(vma, address, address + PAGE_SIZE * nr); } #ifdef CONFIG_DEBUG_TLB_SANITY |