summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/cacheflush.h2
-rw-r--r--arch/arm/include/asm/hugetlb-3level.h4
-rw-r--r--arch/arm64/include/asm/cacheflush.h2
-rw-r--r--arch/arm64/include/asm/hugetlb.h2
-rw-r--r--arch/arm64/mm/hugetlbpage.c2
-rw-r--r--arch/loongarch/include/asm/pgtable.h4
-rw-r--r--arch/mips/include/asm/pgtable.h4
-rw-r--r--arch/mips/mm/cache.c2
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgalloc.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h15
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h40
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h38
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-4k.h47
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h20
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h22
-rw-r--r--arch/powerpc/include/asm/hugetlb.h15
-rw-r--r--arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h38
-rw-r--r--arch/powerpc/include/asm/nohash/32/mmu-8xx.h9
-rw-r--r--arch/powerpc/include/asm/nohash/32/pte-44x.h3
-rw-r--r--arch/powerpc/include/asm/nohash/32/pte-85xx.h3
-rw-r--r--arch/powerpc/include/asm/nohash/32/pte-8xx.h58
-rw-r--r--arch/powerpc/include/asm/nohash/hugetlb-e500.h39
-rw-r--r--arch/powerpc/include/asm/nohash/mmu-e500.h6
-rw-r--r--arch/powerpc/include/asm/nohash/pgalloc.h2
-rw-r--r--arch/powerpc/include/asm/nohash/pgtable.h46
-rw-r--r--arch/powerpc/include/asm/nohash/pte-e500.h63
-rw-r--r--arch/powerpc/include/asm/page.h32
-rw-r--r--arch/powerpc/include/asm/pgtable-be-types.h10
-rw-r--r--arch/powerpc/include/asm/pgtable-types.h13
-rw-r--r--arch/powerpc/include/asm/pgtable.h3
-rw-r--r--arch/powerpc/kernel/exceptions-64e.S4
-rw-r--r--arch/powerpc/kernel/head_85xx.S70
-rw-r--r--arch/powerpc/kernel/head_8xx.S10
-rw-r--r--arch/powerpc/kernel/setup_64.c6
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c11
-rw-r--r--arch/powerpc/mm/book3s64/hugetlbpage.c10
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c12
-rw-r--r--arch/powerpc/mm/hugetlbpage.c455
-rw-r--r--arch/powerpc/mm/init-common.c8
-rw-r--r--arch/powerpc/mm/kasan/8xx.c21
-rw-r--r--arch/powerpc/mm/nohash/8xx.c43
-rw-r--r--arch/powerpc/mm/nohash/Makefile2
-rw-r--r--arch/powerpc/mm/nohash/book3e_pgtable.c4
-rw-r--r--arch/powerpc/mm/nohash/tlb.c407
-rw-r--r--arch/powerpc/mm/nohash/tlb_64e.c314
-rw-r--r--arch/powerpc/mm/nohash/tlb_low_64e.S428
-rw-r--r--arch/powerpc/mm/pgtable.c94
-rw-r--r--arch/powerpc/mm/pgtable_32.c2
-rw-r--r--arch/riscv/include/asm/hugetlb.h2
-rw-r--r--arch/riscv/include/asm/pgtable.h4
-rw-r--r--arch/riscv/mm/hugetlbpage.c2
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/Makefile2
-rw-r--r--arch/s390/boot/Makefile3
-rw-r--r--arch/s390/boot/kmsan.c6
-rw-r--r--arch/s390/boot/startup.c7
-rw-r--r--arch/s390/boot/string.c16
-rw-r--r--arch/s390/include/asm/checksum.h2
-rw-r--r--arch/s390/include/asm/cpacf.h3
-rw-r--r--arch/s390/include/asm/cpu_mf.h6
-rw-r--r--arch/s390/include/asm/hugetlb.h4
-rw-r--r--arch/s390/include/asm/irqflags.h17
-rw-r--r--arch/s390/include/asm/kmsan.h59
-rw-r--r--arch/s390/include/asm/pgtable.h12
-rw-r--r--arch/s390/include/asm/string.h20
-rw-r--r--arch/s390/include/asm/thread_info.h2
-rw-r--r--arch/s390/include/asm/uaccess.h112
-rw-r--r--arch/s390/kernel/diag.c10
-rw-r--r--arch/s390/kernel/ftrace.c2
-rw-r--r--arch/s390/kernel/traps.c6
-rw-r--r--arch/s390/kernel/unwind_bc.c4
-rw-r--r--arch/s390/mm/hugetlbpage.c4
-rw-r--r--arch/sparc/mm/init_64.c2
-rw-r--r--arch/x86/mm/init_64.c14
-rw-r--r--arch/x86/mm/pat/set_memory.c4
-rw-r--r--arch/xtensa/include/asm/pgtable.h6
-rw-r--r--arch/xtensa/mm/tlb.c6
78 files changed, 958 insertions, 1828 deletions
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 1075534b0a2e..8ed8b9a24efe 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -283,7 +283,7 @@ void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr,
* flush_dcache_page is used when the kernel has written to the page
* cache page at virtual address page->virtual.
*
- * If this page isn't mapped (ie, page_mapping == NULL), or it might
+ * If this page isn't mapped (ie, folio_mapping == NULL), or it might
* have userspace mappings, then we _must_ always clean + invalidate
* the dcache entries associated with the kernel mapping.
*
diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h
index a30be5505793..87d48e2d90ad 100644
--- a/arch/arm/include/asm/hugetlb-3level.h
+++ b/arch/arm/include/asm/hugetlb-3level.h
@@ -13,12 +13,12 @@
/*
* If our huge pte is non-zero then mark the valid bit.
- * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero
+ * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for non-zero
* ptes.
* (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes).
*/
#define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
pte_t retval = *ptep;
if (pte_val(retval))
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index fefac75fa009..28ab96e808ef 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -117,7 +117,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *,
* flush_dcache_folio is used when the kernel has written to the page
* cache page at virtual address page->virtual.
*
- * If this page isn't mapped (ie, page_mapping == NULL), or it might
+ * If this page isn't mapped (ie, folio_mapping == NULL), or it might
* have userspace mappings, then we _must_ always clean + invalidate
* the dcache entries associated with the kernel mapping.
*
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 3954cbd2ff56..293f880865e8 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz);
#define __HAVE_ARCH_HUGE_PTEP_GET
-extern pte_t huge_ptep_get(pte_t *ptep);
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void __init arm64_hugetlb_cma_reserve(void);
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 3f09ac73cce3..5f1e2103888b 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -127,7 +127,7 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
return contig_ptes;
}
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
int ncontig, i;
size_t pgsize;
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index af3acdf3481a..161dd6e10479 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -467,8 +467,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
#define update_mmu_cache(vma, addr, ptep) \
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
-#define __HAVE_ARCH_UPDATE_MMU_TLB
-#define update_mmu_tlb update_mmu_cache
+#define update_mmu_tlb_range(vma, addr, ptep, nr) \
+ update_mmu_cache_range(NULL, vma, addr, ptep, nr)
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index e27a4c83c548..c29a551eb0ca 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -594,8 +594,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
#define update_mmu_cache(vma, address, ptep) \
update_mmu_cache_range(NULL, vma, address, ptep, 1)
-#define __HAVE_ARCH_UPDATE_MMU_TLB
-#define update_mmu_tlb update_mmu_cache
+#define update_mmu_tlb_range(vma, address, ptep, nr) \
+ update_mmu_cache_range(NULL, vma, address, ptep, nr)
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index df1ced4fc3b5..bf9a37c60e9f 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -112,7 +112,7 @@ void __flush_dcache_pages(struct page *page, unsigned int nr)
}
/*
- * We could delay the flush for the !page_mapping case too. But that
+ * We could delay the flush for the !folio_mapping case too. But that
* case is for exec env/arg pages and those are %99 certainly going to
* get faulted into the tlb (and thus flushed) anyways.
*/
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f8891fbe7c16..bc5a1612be72 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_HUGEPD if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..dd4eb3063175 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgtable_free(void *table, unsigned index_size)
}
}
-#define get_hugepd_cache_index(x) (x)
-
static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
{
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
#define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
- unsigned long hpdval = hpd_val(hpd);
- /*
- * if it is not a pte and have hugepd shift mask
- * set, then it is a hugepd directory pointer
- */
- if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
- ((hpdval & HUGEPD_SHIFT_MASK) != 0))
- return true;
- return false;
-}
-#endif
-
/*
* 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
* a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..0755f2567021 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
#ifdef __KERNEL__
#include <asm/asm-const.h>
+#include <asm/book3s/64/slice.h>
/*
* Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags);
/* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, unsigned long clr,
- unsigned long set,
- int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr,
+ unsigned long set)
{
__be64 old_be, tmp_be;
- unsigned long old;
__asm__ __volatile__(
"1: ldarx %0,0,%3 # pte_update\n\
@@ -182,11 +179,40 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
"r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+ return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long clr,
+ unsigned long set,
+ int huge)
+{
+ unsigned long old;
+
+ old = hash__pte_update_one(ptep, clr, set);
+
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
+ unsigned int psize = get_slice_psize(mm, addr);
+ int nb, i;
+
+ if (psize == MMU_PAGE_16M)
+ nb = SZ_16M / PMD_SIZE;
+ else if (psize == MMU_PAGE_16G)
+ nb = SZ_16G / PUD_SIZE;
+ else
+ nb = 1;
+
+ WARN_ON_ONCE(nb == 1); /* Should never happen */
+
+ for (i = 1; i < nb; i++)
+ hash__pte_update_one(ptep + i, clr, set);
+ }
/* huge pages use the old page table lock */
if (!huge)
assert_pte_locked(mm, addr);
- old = be64_to_cpu(old_be);
if (old & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, ptep, old, huge);
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index aa1c67c8bfc8..f0bba9c5f9c3 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -49,9 +49,6 @@ static inline bool gigantic_page_runtime_supported(void)
return true;
}
-/* hugepd entry valid bit */
-#define HUGEPD_VAL_BITS (0x8000000000000000UL)
-
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
@@ -60,29 +57,7 @@ extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t new_pte);
-/*
- * This should work for other subarchs too. But right now we use the
- * new format only for 64bit book3s
- */
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
- BUG_ON(!hugepd_ok(hpd));
- /*
- * We have only four bits to encode, MMU page size
- */
- BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf);
- return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK);
-}
-
-static inline unsigned int hugepd_mmu_psize(hugepd_t hpd)
-{
- return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2;
-}
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
- return mmu_psize_to_shift(hugepd_mmu_psize(hpd));
-}
static inline void flush_hugetlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
{
@@ -90,19 +65,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
return radix__flush_hugetlb_page(vma, vmaddr);
}
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
- unsigned int pdshift)
-{
- unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd);
-
- return hugepd_page(hpd) + idx;
-}
-
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
- *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2));
-}
-
void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
static inline int check_and_get_huge_psize(int shift)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
deleted file mode 100644
index baf934578c3a..000000000000
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
-#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
-/*
- * hash 4k can't share hugetlb and also doesn't support THP
- */
-#ifndef __ASSEMBLY__
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * With radix , we have hugepage ptes in the pud and pmd entries. We don't
- * need to setup hugepage directory for them. Our pte and page directory format
- * enable us to have this enabled.
- */
-static inline int hugepd_ok(hugepd_t hpd)
-{
- if (radix_enabled())
- return 0;
- return hash__hugepd_ok(hpd);
-}
-#define is_hugepd(hpd) (hugepd_ok(hpd))
-
-/*
- * 16M and 16G huge page directory tables are allocated from slab cache
- *
- */
-#define H_16M_CACHE_INDEX (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE - 24)
-#define H_16G_CACHE_INDEX \
- (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + H_PUD_INDEX_SIZE - 34)
-
-static inline int get_hugepd_cache_index(int index)
-{
- switch (index) {
- case H_16M_CACHE_INDEX:
- return HTLB_16M_INDEX;
- case H_16G_CACHE_INDEX:
- return HTLB_16G_INDEX;
- default:
- BUG();
- }
- /* should not reach */
-}
-
-#endif /* CONFIG_HUGETLB_PAGE */
-
-#endif /* __ASSEMBLY__ */
-
-#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index 6ac73da7b80e..4d8d7b4ea16b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -5,26 +5,6 @@
#ifndef __ASSEMBLY__
#ifdef CONFIG_HUGETLB_PAGE
-/*
- * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
- * need to setup hugepage directory for them. Our pte and page directory format
- * enable us to have this enabled.
- */
-static inline int hugepd_ok(hugepd_t hpd)
-{
- return 0;
-}
-
-#define is_hugepd(pdep) 0
-
-/*
- * This should never get called
- */
-static __always_inline int get_hugepd_cache_index(int index)
-{
- BUILD_BUG();
-}
-
#endif /* CONFIG_HUGETLB_PAGE */
static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8f9432e3855a..519b1743a0f4 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -274,6 +274,24 @@ static inline bool pud_leaf(pud_t pud)
{
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
}
+
+#define pmd_leaf_size pmd_leaf_size
+static inline unsigned long pmd_leaf_size(pmd_t pmd)
+{
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled())
+ return SZ_16M;
+ else
+ return PMD_SIZE;
+}
+
+#define pud_leaf_size pud_leaf_size
+static inline unsigned long pud_leaf_size(pud_t pud)
+{
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled())
+ return SZ_16G;
+ else
+ return PUD_SIZE;
+}
#endif /* __ASSEMBLY__ */
#include <asm/book3s/64/hash.h>
@@ -285,11 +303,9 @@ static inline bool pud_leaf(pud_t pud)
#define MAX_PHYSMEM_BITS R_MAX_PHYSMEM_BITS
#endif
-
+/* hash 4k can't share hugetlb and also doesn't support THP */
#ifdef CONFIG_PPC_64K_PAGES
#include <asm/book3s/64/pgtable-64k.h>
-#else
-#include <asm/book3s/64/pgtable-4k.h>
#endif
#include <asm/barrier.h>
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index ea71f7245a63..18a3028ac3b6 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -30,10 +30,9 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
}
#define is_hugepage_only_range is_hugepage_only_range
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
- unsigned long end, unsigned long floor,
- unsigned long ceiling);
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz);
#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -67,14 +66,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
{
}
-#define hugepd_shift(x) 0
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
- unsigned pdshift)
-{
- return NULL;
-}
-
-
static inline void __init gigantic_hugetlb_cma_reserve(void)
{
}
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..014799557f60 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,42 +4,12 @@
#define PAGE_SHIFT_8M 23
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
- BUG_ON(!hugepd_ok(hpd));
-
- return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
- return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
- unsigned int pdshift)
-{
- unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
- return hugepd_page(hpd) + idx;
-}
-
static inline void flush_hugetlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
{
flush_tlb_page(vma, vmaddr);
}
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
- *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
- *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
-
static inline int check_and_get_huge_psize(int shift)
{
return shift_to_mmu_psize(shift);
@@ -49,6 +19,14 @@ static inline int check_and_get_huge_psize(int shift)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte, unsigned long sz);
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ if (ptep_is_8m_pmdp(mm, addr, ptep))
+ ptep = pte_offset_kernel((pmd_t *)ptep, ALIGN_DOWN(addr, SZ_8M));
+ return ptep_get(ptep);
+}
+
#define __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 141d82e249a8..a756a1e59c54 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -189,19 +189,14 @@ typedef struct {
#define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000)
-/* Page size definitions, common between 32 and 64-bit
+/*
+ * Page size definitions for 8xx
*
* shift : is the "PAGE_SHIFT" value for that page size
- * penc : is the pte encoding mask
*
*/
struct mmu_psize_def {
unsigned int shift; /* number of bits */
- unsigned int enc; /* PTE encoding */
- unsigned int ind; /* Corresponding indirect page size shift */
- unsigned int flags;
-#define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */
-#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
};
extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h
index 851813725237..da0469928273 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h
@@ -75,9 +75,6 @@
#define _PAGE_NO_CACHE 0x00000400 /* H: I bit */
#define _PAGE_WRITETHRU 0x00000800 /* H: W bit */
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE 0
-
/* TODO: Add large page lowmem mapping support */
#define _PMD_PRESENT 0
#define _PMD_PRESENT_MASK (PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
index 653a342d3b25..14d64b4f3f14 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h
@@ -31,9 +31,6 @@
#define _PAGE_WRITETHRU 0x00400 /* H: W bit */
#define _PAGE_SPECIAL 0x00800 /* S: Special page */
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE 0
-
#define _PMD_PRESENT 0
#define _PMD_PRESENT_MASK (PAGE_MASK)
#define _PMD_BAD (~PAGE_MASK)
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 137dc3c84e45..54ebb91dbdcf 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -74,12 +74,11 @@
#define _PTE_NONE_MASK 0
#ifdef CONFIG_PPC_16K_PAGES
-#define _PAGE_PSIZE _PAGE_SPS
+#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SPS)
#else
-#define _PAGE_PSIZE 0
+#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED)
#endif
-#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
#define _PAGE_BASE (_PAGE_BASE_NC)
#include <asm/pgtable-masks.h>
@@ -120,7 +119,7 @@ static inline pte_t pte_mkhuge(pte_t pte)
#define pte_mkhuge pte_mkhuge
-static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
unsigned long clr, unsigned long set, int huge);
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -142,19 +141,12 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *pt
}
#define __ptep_set_access_flags __ptep_set_access_flags
-static inline unsigned long pgd_leaf_size(pgd_t pgd)
-{
- if (pgd_val(pgd) & _PMD_PAGE_8M)
- return SZ_8M;
- return SZ_4M;
-}
-
-#define pgd_leaf_size pgd_leaf_size
-
-static inline unsigned long pte_leaf_size(pte_t pte)
+static inline unsigned long __pte_leaf_size(pmd_t pmd, pte_t pte)
{
pte_basic_t val = pte_val(pte);
+ if (pmd_val(pmd) & _PMD_PAGE_8M)
+ return SZ_8M;
if (val & _PAGE_HUGE)
return SZ_512K;
if (val & _PAGE_SPS)
@@ -162,31 +154,38 @@ static inline unsigned long pte_leaf_size(pte_t pte)
return SZ_4K;
}
-#define pte_leaf_size pte_leaf_size
+#define __pte_leaf_size __pte_leaf_size
/*
* On the 8xx, the page tables are a bit special. For 16k pages, we have
* 4 identical entries. For 512k pages, we have 128 entries as if it was
* 4k pages, but they are flagged as 512k pages for the hardware.
- * For other page sizes, we have a single entry in the table.
+ * For 8M pages, we have 1024 entries as if it was 4M pages (PMD_SIZE)
+ * but they are flagged as 8M pages for the hardware.
+ * For 4k pages, we have a single entry in the table.
*/
static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
-static int hugepd_ok(hugepd_t hpd);
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address);
+
+static inline bool ptep_is_8m_pmdp(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ return (pmd_t *)ptep == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M));
+}
static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
{
if (!huge)
return PAGE_SIZE / SZ_4K;
- else if (hugepd_ok(*((hugepd_t *)pmd)))
- return 1;
+ else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M)
+ return SZ_4M / SZ_4K;
else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE))
return SZ_16K / SZ_4K;
else
return SZ_512K / SZ_4K;
}
-static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
- unsigned long clr, unsigned long set, int huge)
+static inline pte_basic_t __pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
+ unsigned long clr, unsigned long set, int huge)
{
pte_basic_t *entry = (pte_basic_t *)p;
pte_basic_t old = pte_val(*p);
@@ -198,7 +197,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
for (i = 0; i < num; i += PAGE_SIZE / SZ_4K, new += PAGE_SIZE) {
*entry++ = new;
- if (IS_ENABLED(CONFIG_PPC_16K_PAGES) && num != 1) {
+ if (IS_ENABLED(CONFIG_PPC_16K_PAGES)) {
*entry++ = new;
*entry++ = new;
*entry++ = new;
@@ -208,6 +207,21 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
return old;
}
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ unsigned long clr, unsigned long set, int huge)
+{
+ pte_basic_t old;
+
+ if (huge && ptep_is_8m_pmdp(mm, addr, ptep)) {
+ pmd_t *pmdp = (pmd_t *)ptep;
+
+ old = __pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, huge);
+ __pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, huge);
+ } else {
+ old = __pte_update(mm, addr, ptep, clr, set, huge);
+ }
+ return old;
+}
#define pte_update pte_update
#ifdef CONFIG_PPC_16K_PAGES
diff --git a/arch/powerpc/include/asm/nohash/hugetlb-e500.h b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
index 8f04ad20e040..cab0e1f1eea0 100644
--- a/arch/powerpc/include/asm/nohash/hugetlb-e500.h
+++ b/arch/powerpc/include/asm/nohash/hugetlb-e500.h
@@ -2,38 +2,8 @@
#ifndef _ASM_POWERPC_NOHASH_HUGETLB_E500_H
#define _ASM_POWERPC_NOHASH_HUGETLB_E500_H
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
- if (WARN_ON(!hugepd_ok(hpd)))
- return NULL;
-
- return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
- return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
- unsigned int pdshift)
-{
- /*
- * On FSL BookE, we have multiple higher-level table entries that
- * point to the same hugepte. Just use the first one since they're all
- * identical. So for that case, idx=0.
- */
- return hugepd_page(hpd);
-}
-
void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
-{
- /* We use the old format for PPC_E500 */
- *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-}
-
static inline int check_and_get_huge_psize(int shift)
{
if (shift & 1) /* Not a power of 4 */
@@ -42,4 +12,13 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
}
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
+{
+ unsigned int tsize = shift - _PAGE_PSIZE_SHIFT_OFFSET;
+ pte_basic_t val = (tsize << _PAGE_PSIZE_SHIFT) & _PAGE_PSIZE_MSK;
+
+ return __pte((pte_val(entry) & ~(pte_basic_t)_PAGE_PSIZE_MSK) | val);
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
#endif /* _ASM_POWERPC_NOHASH_HUGETLB_E500_H */
diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h
index 6ddced0415cb..b281d9eeaf1e 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -244,14 +244,11 @@ typedef struct {
/* Page size definitions, common between 32 and 64-bit
*
* shift : is the "PAGE_SHIFT" value for that page size
- * penc : is the pte encoding mask
*
*/
struct mmu_psize_def
{
unsigned int shift; /* number of bits */
- unsigned int enc; /* PTE encoding */
- unsigned int ind; /* Corresponding indirect page size shift */
unsigned int flags;
#define MMU_PAGE_SIZE_DIRECT 0x1 /* Supported as a direct size */
#define MMU_PAGE_SIZE_INDIRECT 0x2 /* Supported as an indirect size */
@@ -303,8 +300,7 @@ extern unsigned long linear_map_top;
extern int book3e_htw_mode;
#define PPC_HTW_NONE 0
-#define PPC_HTW_IBM 1
-#define PPC_HTW_E6500 2
+#define PPC_HTW_E6500 1
/*
* 64-bit booke platforms don't load the tlb in the tlb miss handler code.
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4b62376318e1..d06efac6d7aa 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -44,8 +44,6 @@ static inline void pgtable_free(void *table, int shift)
}
}
-#define get_hugepd_cache_index(x) (x)
-
static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
unsigned long pgf = (unsigned long)table;
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index f5f39d4f03c8..8d1f0b7062eb 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -31,6 +31,13 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
extern int icache_44x_need_flush;
+#ifndef pte_huge_size
+static inline unsigned long pte_huge_size(pte_t pte)
+{
+ return PAGE_SIZE;
+}
+#endif
+
/*
* PTE updates. This function is called whenever an existing
* valid PTE is updated. This does -not- include set_pte_at()
@@ -52,11 +59,34 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
{
pte_basic_t old = pte_val(*p);
pte_basic_t new = (old & ~(pte_basic_t)clr) | set;
+ unsigned long sz;
+ unsigned long pdsize;
+ int i;
if (new == old)
return old;
- *p = __pte(new);
+ if (huge)
+ sz = pte_huge_size(__pte(old));
+ else
+ sz = PAGE_SIZE;
+
+ if (sz < PMD_SIZE)
+ pdsize = PAGE_SIZE;
+ else if (sz < PUD_SIZE)
+ pdsize = PMD_SIZE;
+ else if (sz < P4D_SIZE)
+ pdsize = PUD_SIZE;
+ else if (sz < PGDIR_SIZE)
+ pdsize = P4D_SIZE;
+ else
+ pdsize = PGDIR_SIZE;
+
+ for (i = 0; i < sz / pdsize; i++, p++) {
+ *p = __pte(new);
+ if (new)
+ new += (unsigned long long)(pdsize / PAGE_SIZE) << PTE_RPN_SHIFT;
+ }
if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & _PAGE_EXEC))
icache_44x_need_flush = 1;
@@ -340,20 +370,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
#define pgprot_writecombine pgprot_noncached_wc
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hugepd_ok(hugepd_t hpd)
-{
-#ifdef CONFIG_PPC_8xx
- return ((hpd_val(hpd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M);
-#else
- /* We clear the top bit to indicate hugepd */
- return (hpd_val(hpd) && (hpd_val(hpd) & PD_HUGE) == 0);
-#endif
-}
-
-#define is_hugepd(hpd) (hugepd_ok(hpd))
-#endif
-
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
void unmap_kernel_page(unsigned long va);
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h
index f516f0b5b7a8..cb78392494da 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -19,20 +19,7 @@
#define _PAGE_BAP_SX 0x000040
#define _PAGE_BAP_UX 0x000080
#define _PAGE_PSIZE_MSK 0x000f00
-#define _PAGE_PSIZE_4K 0x000200
-#define _PAGE_PSIZE_8K 0x000300
-#define _PAGE_PSIZE_16K 0x000400
-#define _PAGE_PSIZE_32K 0x000500
-#define _PAGE_PSIZE_64K 0x000600
-#define _PAGE_PSIZE_128K 0x000700
-#define _PAGE_PSIZE_256K 0x000800
-#define _PAGE_PSIZE_512K 0x000900
-#define _PAGE_PSIZE_1M 0x000a00
-#define _PAGE_PSIZE_2M 0x000b00
-#define _PAGE_PSIZE_4M 0x000c00
-#define _PAGE_PSIZE_8M 0x000d00
-#define _PAGE_PSIZE_16M 0x000e00
-#define _PAGE_PSIZE_32M 0x000f00
+#define _PAGE_TSIZE_4K 0x000100
#define _PAGE_DIRTY 0x001000 /* C: page changed */
#define _PAGE_SW0 0x002000
#define _PAGE_U3 0x004000
@@ -46,6 +33,9 @@
#define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */
#define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */
+#define _PAGE_PSIZE_SHIFT 7
+#define _PAGE_PSIZE_SHIFT_OFFSET 10
+
/* "Higher level" linux bit combinations */
#define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* .. and was cache cleaned */
#define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read permission */
@@ -65,8 +55,6 @@
#define _PAGE_SPECIAL _PAGE_SW0
-/* Base page size */
-#define _PAGE_PSIZE _PAGE_PSIZE_4K
#define PTE_RPN_SHIFT (24)
#define PTE_WIMGE_SHIFT (19)
@@ -89,7 +77,7 @@
* pages. We always set _PAGE_COHERENT when SMP is enabled or
* the processor might need it for DMA coherency.
*/
-#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_TSIZE_4K)
#if defined(CONFIG_SMP)
#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
#else
@@ -105,6 +93,47 @@ static inline pte_t pte_mkexec(pte_t pte)
}
#define pte_mkexec pte_mkexec
+static inline unsigned long pte_huge_size(pte_t pte)
+{
+ pte_basic_t val = pte_val(pte);
+
+ return 1UL << (((val & _PAGE_PSIZE_MSK) >> _PAGE_PSIZE_SHIFT) + _PAGE_PSIZE_SHIFT_OFFSET);
+}
+#define pte_huge_size pte_huge_size
+
+static inline int pmd_leaf(pmd_t pmd)
+{
+ if (IS_ENABLED(CONFIG_PPC64))
+ return (long)pmd_val(pmd) > 0;
+ else
+ return pmd_val(pmd) & _PAGE_PSIZE_MSK;
+}
+#define pmd_leaf pmd_leaf
+
+static inline unsigned long pmd_leaf_size(pmd_t pmd)
+{
+ return pte_huge_size(__pte(pmd_val(pmd)));
+}
+#define pmd_leaf_size pmd_leaf_size
+
+#ifdef CONFIG_PPC64
+static inline int pud_leaf(pud_t pud)
+{
+ if (IS_ENABLED(CONFIG_PPC64))
+ return (long)pud_val(pud) > 0;
+ else
+ return pud_val(pud) & _PAGE_PSIZE_MSK;
+}
+#define pud_leaf pud_leaf
+
+static inline unsigned long pud_leaf_size(pud_t pud)
+{
+ return pte_huge_size(__pte(pud_val(pud)));
+}
+#define pud_leaf_size pud_leaf_size
+
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index e411e5a70ea3..83d0a4fc5f75 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -269,38 +269,6 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
#define is_kernel_addr(x) ((x) >= TASK_SIZE)
#endif
-#ifndef CONFIG_PPC_BOOK3S_64
-/*
- * Use the top bit of the higher-level page table entries to indicate whether
- * the entries we point to contain hugepages. This works because we know that
- * the page tables live in kernel space. If we ever decide to support having
- * page tables at arbitrary addresses, this breaks and will have to change.
- */
-#ifdef CONFIG_PPC64
-#define PD_HUGE 0x8000000000000000UL
-#else
-#define PD_HUGE 0x80000000
-#endif
-
-#else /* CONFIG_PPC_BOOK3S_64 */
-/*
- * Book3S 64 stores real addresses in the hugepd entries to
- * avoid overlaps with _PAGE_PRESENT and _PAGE_PTE.
- */
-#define HUGEPD_ADDR_MASK (0x0ffffffffffffffful & ~HUGEPD_SHIFT_MASK)
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
-/*
- * Some number of bits at the level of the page table that points to
- * a hugepte are used to encode the size. This masks those bits.
- * On 8xx, HW assistance requires 4k alignment for the hugepte.
- */
-#ifdef CONFIG_PPC_8xx
-#define HUGEPD_SHIFT_MASK 0xfff
-#else
-#define HUGEPD_SHIFT_MASK 0x3f
-#endif
-
#ifndef __ASSEMBLY__
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 82633200b500..6bd8f89b25dc 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -101,14 +101,4 @@ static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new)
return pmd_raw(old) == prev;
}
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { __be64 pdbe; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { cpu_to_be64(x) })
-
-static inline unsigned long hpd_val(hugepd_t x)
-{
- return be64_to_cpu(x.pdbe);
-}
-#endif
-
#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 082c85cc09b1..7b3d4c592a10 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,7 +49,11 @@ static inline unsigned long pud_val(pud_t x)
#endif /* CONFIG_PPC64 */
/* PGD level */
+#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+typedef struct { unsigned long long pgd; } pgd_t;
+#else
typedef struct { unsigned long pgd; } pgd_t;
+#endif
#define __pgd(x) ((pgd_t) { (x) })
static inline unsigned long pgd_val(pgd_t x)
{
@@ -83,13 +87,4 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
}
#endif
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-typedef struct { unsigned long pd; } hugepd_t;
-#define __hugepd(x) ((hugepd_t) { (x) })
-static inline unsigned long hpd_val(hugepd_t x)
-{
- return x.pd;
-}
-#endif
-
#endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 239709a2f68e..264a6c09517a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -106,6 +106,9 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr);
void pgtable_cache_add(unsigned int shift);
+#ifdef CONFIG_PPC32
+void __init *early_alloc_pgtable(unsigned long size);
+#endif
pte_t *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va);
#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index dcf0591ad3c2..63f6b9f513a4 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -485,8 +485,8 @@ interrupt_base_book3e: /* fake trap */
EXCEPTION_STUB(0x160, decrementer) /* 0x0900 */
EXCEPTION_STUB(0x180, fixed_interval) /* 0x0980 */
EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */
- EXCEPTION_STUB(0x1c0, data_tlb_miss)
- EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+ EXCEPTION_STUB(0x1c0, data_tlb_miss_bolted)
+ EXCEPTION_STUB(0x1e0, instruction_tlb_miss_bolted)
EXCEPTION_STUB(0x200, altivec_unavailable)
EXCEPTION_STUB(0x220, altivec_assist)
EXCEPTION_STUB(0x260, perfmon)
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index 39724ff5ae1f..f9a73fae6464 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -294,9 +294,10 @@ set_ivor:
/* Macros to hide the PTE size differences
*
* FIND_PTE -- walks the page tables given EA & pgdir pointer
- * r10 -- EA of fault
+ * r10 -- free
* r11 -- PGDIR pointer
* r12 -- free
+ * r13 -- EA of fault
* label 2: is the bailout case
*
* if we find the pte (fall through):
@@ -307,34 +308,34 @@ set_ivor:
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_HUGETLB_PAGE
#define FIND_PTE \
- rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
- lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+ rlwinm r12, r13, 14, 18, 28; /* Compute pgdir/pmd offset */ \
+ add r12, r11, r12; \
+ lwz r11, 4(r12); /* Get pgd/pmd entry */ \
+ rlwinm. r10, r11, 32 - _PAGE_PSIZE_SHIFT, 0x1e; /* get tsize*/ \
+ bne 1000f; /* Huge page (leaf entry) */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \
- blt 1000f; /* Normal non-huge page */ \
beq 2f; /* Bail if no table */ \
- oris r11, r11, PD_HUGE@h; /* Put back address bit */ \
- andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \
- xor r12, r10, r11; /* drop size bits from pointer */ \
- b 1001f; \
-1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
+ rlwimi r12, r13, 23, 20, 28; /* Compute pte address */ \
li r10, 0; /* clear r10 */ \
-1001: lwz r11, 4(r12); /* Get pte entry */
+ lwz r11, 4(r12); /* Get pte entry */ \
+1000:
#else
#define FIND_PTE \
- rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
- lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+ rlwinm r12, r13, 14, 18, 28; /* Compute pgdir/pmd offset */ \
+ add r12, r11, r12; \
+ lwz r11, 4(r12); /* Get pgd/pmd entry */ \
rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \
beq 2f; /* Bail if no table */ \
- rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
+ rlwimi r12, r13, 23, 20, 28; /* Compute pte address */ \
lwz r11, 4(r12); /* Get pte entry */
#endif /* HUGEPAGE */
#else /* !PTE_64BIT */
#define FIND_PTE \
- rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \
+ rlwimi r11, r13, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \
lwz r11, 0(r11); /* Get L1 entry */ \
rlwinm. r12, r11, 0, 0, 19; /* Extract L2 (pte) base address */ \
beq 2f; /* Bail if no table */ \
- rlwimi r12, r10, 22, 20, 29; /* Compute PTE address */ \
+ rlwimi r12, r13, 22, 20, 29; /* Compute PTE address */ \
lwz r11, 0(r12); /* Get Linux PTE */
#endif
@@ -441,13 +442,13 @@ START_BTB_FLUSH_SECTION
BTB_FLUSH(r10)
1:
END_BTB_FLUSH_SECTION
- mfspr r10, SPRN_DEAR /* Get faulting address */
+ mfspr r13, SPRN_DEAR /* Get faulting address */
/* If we are faulting a kernel address, we have to use the
* kernel page tables.
*/
lis r11, PAGE_OFFSET@h
- cmplw 5, r10, r11
+ cmplw 5, r13, r11
blt 5, 3f
lis r11, swapper_pg_dir@h
ori r11, r11, swapper_pg_dir@l
@@ -470,29 +471,14 @@ END_BTB_FLUSH_SECTION
#endif
4:
- /* Mask of required permission bits. Note that while we
- * do copy ESR:ST to _PAGE_WRITE position as trying to write
- * to an RO page is pretty common, we don't do it with
- * _PAGE_DIRTY. We could do it, but it's a fairly rare
- * event so I'd rather take the overhead when it happens
- * rather than adding an instruction here. We should measure
- * whether the whole thing is worth it in the first place
- * as we could avoid loading SPRN_ESR completely in the first
- * place...
- *
- * TODO: Is it worth doing that mfspr & rlwimi in the first
- * place or can we save a couple of instructions here ?
- */
- mfspr r12,SPRN_ESR
+ FIND_PTE
+
#ifdef CONFIG_PTE_64BIT
li r13,_PAGE_PRESENT|_PAGE_BAP_SR
oris r13,r13,_PAGE_ACCESSED@h
#else
li r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED
#endif
- rlwimi r13,r12,11,29,29
-
- FIND_PTE
andc. r13,r13,r11 /* Check permission */
#ifdef CONFIG_PTE_64BIT
@@ -549,13 +535,13 @@ START_BTB_FLUSH_SECTION
1:
END_BTB_FLUSH_SECTION
- mfspr r10, SPRN_SRR0 /* Get faulting address */
+ mfspr r13, SPRN_SRR0 /* Get faulting address */
/* If we are faulting a kernel address, we have to use the
* kernel page tables.
*/
lis r11, PAGE_OFFSET@h
- cmplw 5, r10, r11
+ cmplw 5, r13, r11
blt 5, 3f
lis r11, swapper_pg_dir@h
ori r11, r11, swapper_pg_dir@l
@@ -564,6 +550,7 @@ END_BTB_FLUSH_SECTION
rlwinm r12,r12,0,16,1
mtspr SPRN_MAS1,r12
+ FIND_PTE
/* Make up the required permissions for kernel code */
#ifdef CONFIG_PTE_64BIT
li r13,_PAGE_PRESENT | _PAGE_BAP_SX
@@ -584,6 +571,7 @@ END_BTB_FLUSH_SECTION
beq 2f /* KUAP fault */
#endif
+ FIND_PTE
/* Make up the required permissions for user code */
#ifdef CONFIG_PTE_64BIT
li r13,_PAGE_PRESENT | _PAGE_BAP_UX
@@ -593,7 +581,6 @@ END_BTB_FLUSH_SECTION
#endif
4:
- FIND_PTE
andc. r13,r13,r11 /* Check permission */
#ifdef CONFIG_PTE_64BIT
@@ -746,17 +733,12 @@ finish_tlb_load:
lwz r15, 0(r14)
100: stw r15, 0(r17)
- /*
- * Calc MAS1_TSIZE from r10 (which has pshift encoded)
- * tlb_enc = (pshift - 10).
- */
- subi r15, r10, 10
mfspr r16, SPRN_MAS1
- rlwimi r16, r15, 7, 20, 24
+ rlwimi r16, r10, MAS1_TSIZE_SHIFT, MAS1_TSIZE_MASK
mtspr SPRN_MAS1, r16
/* copy the pshift for use later */
- mr r14, r10
+ addi r14, r10, _PAGE_PSIZE_SHIFT_OFFSET
/* fall through */
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index edc479a7c2bc..ac74321b1192 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -415,14 +415,13 @@ FixupDAR:/* Entry point for dcbx workaround. */
oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha
3:
lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */
+ rlwinm r11, r11, 0, ~_PMD_PAGE_8M
mtspr SPRN_MD_TWC, r11
- mtcrf 0x01, r11
mfspr r11, SPRN_MD_TWC
lwz r11, 0(r11) /* Get the pte */
- bt 28,200f /* bit 28 = Large page (8M) */
/* concat physical page address(r11) and page offset(r10) */
rlwimi r11, r10, 0, 32 - PAGE_SHIFT, 31
-201: lwz r11,0(r11)
+ lwz r11,0(r11)
/* Check if it really is a dcbx instruction. */
/* dcbt and dcbtst does not generate DTLB Misses/Errors,
* no need to include them here */
@@ -441,11 +440,6 @@ FixupDAR:/* Entry point for dcbx workaround. */
141: mfspr r10,SPRN_M_TW
b DARFixed /* Nope, go back to normal TLB processing */
-200:
- /* concat physical page address(r11) and page offset(r10) */
- rlwimi r11, r10, 0, 32 - PAGE_SHIFT_8M, 31
- b 201b
-
144: mfspr r10, SPRN_DSISR
rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */
mtspr SPRN_DSISR, r10
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ae36a129789f..22f83fbbc762 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -696,11 +696,7 @@ __init u64 ppc64_bolted_size(void)
{
#ifdef CONFIG_PPC_BOOK3E_64
/* Freescale BookE bolts the entire linear mapping */
- /* XXX: BookE ppc64_rma_limit setup seems to disagree? */
- if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
- return linear_map_top;
- /* Other BookE, we assume the first GB is bolted */
- return 1ul << 30;
+ return linear_map_top;
#else
/* BookS radix, does not take faults on linear mapping */
if (early_radix_enabled())
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 01c3b4b65241..6727a15ab94f 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1233,10 +1233,6 @@ void __init hash__early_init_mmu(void)
__pmd_table_size = H_PMD_TABLE_SIZE;
__pud_table_size = H_PUD_TABLE_SIZE;
__pgd_table_size = H_PGD_TABLE_SIZE;
- /*
- * 4k use hugepd format, so for hash set then to
- * zero
- */
__pmd_val_bits = HASH_PMD_VAL_BITS;
__pud_val_bits = HASH_PUD_VAL_BITS;
__pgd_val_bits = HASH_PGD_VAL_BITS;
@@ -1546,6 +1542,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) {
+ if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift;
+ }
+
/*
* Add _PAGE_PRESENT to the required access perm. If there are parallel
* updates to the pte that can possibly clear _PAGE_PTE, catch that too.
diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index 5a2e512e96db..83c3361b358b 100644
--- a/arch/powerpc/mm/book3s64/hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
/* If PTE permissions don't match, take page fault */
if (unlikely(!check_pte_access(access, old_pte)))
return 1;
+ /*
+ * If hash-4k, hugepages use seeral contiguous PxD entries
+ * so bail out and let mm make the page young or dirty
+ */
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
+ if (!(old_pte & _PAGE_ACCESSED))
+ return 1;
+ if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
+ return 1;
+ }
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 2975ea0841ba..f4d8d3c40e5c 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -461,18 +461,6 @@ static inline void pgtable_free(void *table, int index)
case PUD_INDEX:
__pud_free(table);
break;
-#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
- /* 16M hugepd directory at pud level */
- case HTLB_16M_INDEX:
- BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
- break;
- /* 16G hugepd directory at the pgd level */
- case HTLB_16G_INDEX:
- BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
- break;
-#endif
/* We don't free pgd table via RCU callback */
default:
BUG();
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 594a4b7b2ca2..6b043180220a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -28,8 +28,6 @@
bool hugetlb_disabled = false;
-#define hugepd_none(hpd) (hpd_val(hpd) == 0)
-
#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \
__builtin_ffs(sizeof(void *)))
@@ -42,156 +40,43 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address, unsigned int pdshift,
- unsigned int pshift, spinlock_t *ptl)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
{
- struct kmem_cache *cachep;
- pte_t *new;
- int i;
- int num_hugepd;
-
- if (pshift >= pdshift) {
- cachep = PGT_CACHE(PTE_T_ORDER);
- num_hugepd = 1 << (pshift - pdshift);
- } else {
- cachep = PGT_CACHE(pdshift - pshift);
- num_hugepd = 1;
- }
-
- if (!cachep) {
- WARN_ONCE(1, "No page table cache created for hugetlb tables");
- return -ENOMEM;
- }
-
- new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
- BUG_ON(pshift > HUGEPD_SHIFT_MASK);
- BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+ addr &= ~(sz - 1);
- if (!new)
- return -ENOMEM;
+ p4d = p4d_offset(pgd_offset(mm, addr), addr);
+ if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+ return (pte_t *)p4d;
- /*
- * Make sure other cpus find the hugepd set only after a
- * properly initialized page table is visible to them.
- * For more details look for comment in __pte_alloc().
- */
- smp_wmb();
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+ return (pte_t *)pud;
- spin_lock(ptl);
- /*
- * We have multiple higher-level entries that point to the same
- * actual pte location. Fill in each as we go and backtrack on error.
- * We need all of these so the DTLB pgtable walk code can find the
- * right higher-level entry without knowing if it's a hugepage or not.
- */
- for (i = 0; i < num_hugepd; i++, hpdp++) {
- if (unlikely(!hugepd_none(*hpdp)))
- break;
- hugepd_populate(hpdp, new, pshift);
- }
- /* If we bailed from the for loop early, an error occurred, clean up */
- if (i < num_hugepd) {
- for (i = i - 1 ; i >= 0; i--, hpdp--)
- *hpdp = __hugepd(0);
- kmem_cache_free(cachep, new);
- } else {
- kmemleak_ignore(new);
- }
- spin_unlock(ptl);
- return 0;
-}
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return NULL;
-/*
- * At this point we do the placement change only for BOOK3S 64. This would
- * possibly work on other subarchs.
- */
-pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pg;
- p4d_t *p4;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pshift = __ffs(sz);
- unsigned pdshift = PGDIR_SHIFT;
- spinlock_t *ptl;
-
- addr &= ~(sz-1);
- pg = pgd_offset(mm, addr);
- p4 = p4d_offset(pg, addr);
+ if (sz >= PMD_SIZE) {
+ /* On 8xx, all hugepages are handled as contiguous PTEs */
+ if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ int i;
-#ifdef CONFIG_PPC_BOOK3S_64
- if (pshift == PGDIR_SHIFT)
- /* 16GB huge page */
- return (pte_t *) p4;
- else if (pshift > PUD_SHIFT) {
- /*
- * We need to use hugepd table
- */
- ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)p4;
- } else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, p4, addr);
- if (!pu)
- return NULL;
- if (pshift == PUD_SHIFT)
- return (pte_t *)pu;
- else if (pshift > PMD_SHIFT) {
- ptl = pud_lockptr(mm, pu);
- hpdp = (hugepd_t *)pu;
- } else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- if (!pm)
- return NULL;
- if (pshift == PMD_SHIFT)
- /* 16MB hugepage */
- return (pte_t *)pm;
- else {
- ptl = pmd_lockptr(mm, pm);
- hpdp = (hugepd_t *)pm;
+ for (i = 0; i < sz / PMD_SIZE; i++) {
+ if (!pte_alloc_huge(mm, pmd + i, addr))
+ return NULL;
}
}
+ return (pte_t *)pmd;
}
-#else
- if (pshift >= PGDIR_SHIFT) {
- ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)p4;
- } else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, p4, addr);
- if (!pu)
- return NULL;
- if (pshift >= PUD_SHIFT) {
- ptl = pud_lockptr(mm, pu);
- hpdp = (hugepd_t *)pu;
- } else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- if (!pm)
- return NULL;
- ptl = pmd_lockptr(mm, pm);
- hpdp = (hugepd_t *)pm;
- }
- }
-#endif
- if (!hpdp)
- return NULL;
-
- if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
- return pte_alloc_huge(mm, (pmd_t *)hpdp, addr);
-
- BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
- pdshift, pshift, ptl))
- return NULL;
-
- return hugepte_offset(*hpdp, addr, pdshift);
+ return pte_alloc_huge(mm, pmd, addr);
}
#ifdef CONFIG_PPC_BOOK3S_64
@@ -248,264 +133,6 @@ int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
return __alloc_bootmem_huge_page(h, nid);
}
-#ifndef CONFIG_PPC_BOOK3S_64
-#define HUGEPD_FREELIST_SIZE \
- ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
-
-struct hugepd_freelist {
- struct rcu_head rcu;
- unsigned int index;
- void *ptes[];
-};
-
-static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
-
-static void hugepd_free_rcu_callback(struct rcu_head *head)
-{
- struct hugepd_freelist *batch =
- container_of(head, struct hugepd_freelist, rcu);
- unsigned int i;
-
- for (i = 0; i < batch->index; i++)
- kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
-
- free_page((unsigned long)batch);
-}
-
-static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
-{
- struct hugepd_freelist **batchp;
-
- batchp = &get_cpu_var(hugepd_freelist_cur);
-
- if (atomic_read(&tlb->mm->mm_users) < 2 ||
- mm_is_thread_local(tlb->mm)) {
- kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
- put_cpu_var(hugepd_freelist_cur);
- return;
- }
-
- if (*batchp == NULL) {
- *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
- (*batchp)->index = 0;
- }
-
- (*batchp)->ptes[(*batchp)->index++] = hugepte;
- if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
- call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
- *batchp = NULL;
- }
- put_cpu_var(hugepd_freelist_cur);
-}
-#else
-static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
-#endif
-
-/* Return true when the entry to be freed maps more than the area being freed */
-static bool range_is_outside_limits(unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling,
- unsigned long mask)
-{
- if ((start & mask) < floor)
- return true;
- if (ceiling) {
- ceiling &= mask;
- if (!ceiling)
- return true;
- }
- return end - 1 > ceiling - 1;
-}
-
-static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
- unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pte_t *hugepte = hugepd_page(*hpdp);
- int i;
-
- unsigned long pdmask = ~((1UL << pdshift) - 1);
- unsigned int num_hugepd = 1;
- unsigned int shift = hugepd_shift(*hpdp);
-
- /* Note: On fsl the hpdp may be the first of several */
- if (shift > pdshift)
- num_hugepd = 1 << (shift - pdshift);
-
- if (range_is_outside_limits(start, end, floor, ceiling, pdmask))
- return;
-
- for (i = 0; i < num_hugepd; i++, hpdp++)
- *hpdp = __hugepd(0);
-
- if (shift >= pdshift)
- hugepd_free(tlb, hugepte);
- else
- pgtable_free_tlb(tlb, hugepte,
- get_hugepd_cache_index(pdshift - shift));
-}
-
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pgtable_t token = pmd_pgtable(*pmd);
-
- if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK))
- return;
-
- pmd_clear(pmd);
- pte_free_tlb(tlb, token, addr);
- mm_dec_nr_ptes(tlb->mm);
-}
-
-static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pmd_t *pmd;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- unsigned long more;
-
- pmd = pmd_offset(pud, addr);
- next = pmd_addr_end(addr, end);
- if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
- if (pmd_none_or_clear_bad(pmd))
- continue;
-
- /*
- * if it is not hugepd pointer, we should already find
- * it cleared.
- */
- WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
-
- hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);
-
- continue;
- }
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
- addr, next, floor, ceiling);
- } while (addr = next, addr != end);
-
- if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
- return;
-
- pmd = pmd_offset(pud, start & PUD_MASK);
- pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start & PUD_MASK);
- mm_dec_nr_pmds(tlb->mm);
-}
-
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pud_t *pud;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- pud = pud_offset(p4d, addr);
- next = pud_addr_end(addr, end);
- if (!is_hugepd(__hugepd(pud_val(*pud)))) {
- if (pud_none_or_clear_bad(pud))
- continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
- ceiling);
- } else {
- unsigned long more;
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
-
- if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
- return;
-
- pud = pud_offset(p4d, start & PGDIR_MASK);
- p4d_clear(p4d);
- pud_free_tlb(tlb, pud, start & PGDIR_MASK);
- mm_dec_nr_puds(tlb->mm);
-}
-
-/*
- * This function frees user-level page tables of a process.
- */
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long next;
-
- /*
- * Because there are a number of different possible pagetable
- * layouts for hugepage ranges, we limit knowledge of how
- * things should be laid out to the allocation path
- * (huge_pte_alloc(), above). Everything else works out the
- * structure as it goes from information in the hugepd
- * pointers. That means that we can't here use the
- * optimization used in the normal page free_pgd_range(), of
- * checking whether we're actually covering a large enough
- * range to have to do anything at the top level of the walk
- * instead of at the bottom.
- *
- * To make sense of this, you should probably go read the big
- * block comment at the top of the normal free_pgd_range(),
- * too.
- */
-
- do {
- next = pgd_addr_end(addr, end);
- pgd = pgd_offset(tlb->mm, addr);
- p4d = p4d_offset(pgd, addr);
- if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
- if (p4d_none_or_clear_bad(p4d))
- continue;
- hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
- } else {
- unsigned long more;
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at the pgd level
- * for a single hugepage, but all of them point to the
- * same kmem cache that holds the hugepte.
- */
- more = addr + (1UL << hugepd_shift(*(hugepd_t *)pgd));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
@@ -552,44 +179,14 @@ static int __init hugetlbpage_init(void)
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
unsigned shift;
- unsigned pdshift;
if (!mmu_psize_defs[psize].shift)
continue;
shift = mmu_psize_to_shift(psize);
-#ifdef CONFIG_PPC_BOOK3S_64
- if (shift > PGDIR_SHIFT)
- continue;
- else if (shift > PUD_SHIFT)
- pdshift = PGDIR_SHIFT;
- else if (shift > PMD_SHIFT)
- pdshift = PUD_SHIFT;
- else
- pdshift = PMD_SHIFT;
-#else
- if (shift < PUD_SHIFT)
- pdshift = PMD_SHIFT;
- else if (shift < PGDIR_SHIFT)
- pdshift = PUD_SHIFT;
- else
- pdshift = PGDIR_SHIFT;
-#endif
-
if (add_huge_page_size(1ULL << shift) < 0)
continue;
- /*
- * if we have pdshift and shift value same, we don't
- * use pgt cache for hugepd.
- */
- if (pdshift > shift) {
- if (!IS_ENABLED(CONFIG_PPC_8xx))
- pgtable_cache_add(pdshift - shift);
- } else if (IS_ENABLED(CONFIG_PPC_E500) ||
- IS_ENABLED(CONFIG_PPC_8xx)) {
- pgtable_cache_add(PTE_T_ORDER);
- }
configured = true;
}
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 21131b96d209..9b4a675eb8f8 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -123,12 +123,8 @@ void pgtable_cache_add(unsigned int shift)
/* When batching pgtable pointers for RCU freeing, we store
* the index size in the low bits. Table alignment must be
* big enough to fit it.
- *
- * Likewise, hugeapge pagetable pointers contain a (different)
- * shift value in the low bits. All tables must be aligned so
- * as to leave enough 0 bits in the address to contain it. */
- unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
- HUGEPD_SHIFT_MASK + 1);
+ */
+ unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
struct kmem_cache *new = NULL;
/* It would be nice if this was a BUILD_BUG_ON(), but at the
diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 2784224054f8..989d6cdf4141 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -6,28 +6,33 @@
#include <linux/memblock.h>
#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+
static int __init
kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
{
pmd_t *pmd = pmd_off_k(k_start);
unsigned long k_cur, k_next;
- for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) {
- pte_basic_t *new;
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) {
+ pte_t *ptep;
+ int i;
k_next = pgd_addr_end(k_cur, k_end);
- k_next = pgd_addr_end(k_next, k_end);
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
- new = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
- if (!new)
+ ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+ if (!ptep)
return -ENOMEM;
- *new = pte_val(pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block)), PAGE_KERNEL)));
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL));
- hugepd_populate_kernel((hugepd_t *)pmd, (pte_t *)new, PAGE_SHIFT_8M);
- hugepd_populate_kernel((hugepd_t *)pmd + 1, (pte_t *)new, PAGE_SHIFT_8M);
+ __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1);
+ }
+ pmd_populate_kernel(&init_mm, pmd, ptep);
+ *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M);
}
return 0;
}
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 43d4842bb1c7..388bba0ab3e7 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -11,6 +11,7 @@
#include <linux/hugetlb.h>
#include <asm/fixmap.h>
+#include <asm/pgalloc.h>
#include <mm/mmu_decl.h>
@@ -48,20 +49,6 @@ unsigned long p_block_mapped(phys_addr_t pa)
return 0;
}
-static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va)
-{
- if (hpd_val(*pmdp) == 0) {
- pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
-
- if (!ptep)
- return NULL;
-
- hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M);
- hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M);
- }
- return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
-}
-
static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
pgprot_t prot, int psize, bool new)
{
@@ -75,26 +62,36 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
if (WARN_ON(slab_is_available()))
return -EINVAL;
- if (psize == MMU_PAGE_512K)
+ if (psize == MMU_PAGE_512K) {
ptep = early_pte_alloc_kernel(pmdp, va);
- else
- ptep = early_hugepd_alloc_kernel((hugepd_t *)pmdp, va);
+ /* The PTE should never be already present */
+ if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+ return -EINVAL;
+ } else {
+ if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1))))
+ return -EINVAL;
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp + 1, ptep);
+
+ ptep = (pte_t *)pmdp;
+ }
} else {
if (psize == MMU_PAGE_512K)
ptep = pte_offset_kernel(pmdp, va);
else
- ptep = hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
+ ptep = (pte_t *)pmdp;
}
if (WARN_ON(!ptep))
return -ENOMEM;
- /* The PTE should never be already present */
- if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
- return -EINVAL;
-
set_huge_pte_at(&init_mm, va, ptep,
- pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
+ pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
+ 1UL << mmu_psize_to_shift(psize));
return 0;
}
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 86d0fe434824..cf60c776c883 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-y += mmu_context.o tlb.o tlb_low.o kup.o
-obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o
obj-$(CONFIG_44x) += 44x.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
obj-$(CONFIG_PPC_E500) += e500.o
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 1c5e4ecbebeb..ad2a7c26f2a0 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -29,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
/* PTEs only contain page size encodings up to 32M */
- BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+ BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
/* Encode the size in the PTE */
- flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+ flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
/* For each PTE for that area, map things. Note that we don't
* increment phys because all PTEs are of the large size and
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 5ffa0af4328a..b653a7be4cb1 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -53,37 +53,30 @@
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
- .enc = BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift = 21,
- .enc = BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift = 22,
- .enc = BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift = 24,
- .enc = BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift = 26,
- .enc = BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift = 28,
- .enc = BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift = 30,
- .enc = BOOK3E_PAGESZ_1GB,
},
};
static inline int mmu_get_tsize(int psize)
{
- return mmu_psize_defs[psize].enc;
+ return mmu_psize_defs[psize].shift - 10;
}
#else
static inline int mmu_get_tsize(int psize)
@@ -110,28 +103,6 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
};
#endif
-/* The variables below are currently only used on 64-bit Book3E
- * though this will probably be made common with other nohash
- * implementations at some point
- */
-#ifdef CONFIG_PPC64
-
-int mmu_pte_psize; /* Page size used for PTE pages */
-int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
-int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
-unsigned long linear_map_top; /* Top of linear mapping */
-
-
-/*
- * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
- * exceptions. This is used for bolted and e6500 TLB miss handlers which
- * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
- * this is set to zero.
- */
-int extlb_level_exc;
-
-#endif /* CONFIG_PPC64 */
-
#ifdef CONFIG_PPC_E500
/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
DEFINE_PER_CPU(int, next_tlbcam_idx);
@@ -358,381 +329,7 @@ void tlb_flush(struct mmu_gather *tlb)
flush_tlb_mm(tlb->mm);
}
-/*
- * Below are functions specific to the 64-bit variant of Book3E though that
- * may change in the future
- */
-
-#ifdef CONFIG_PPC64
-
-/*
- * Handling of virtual linear page tables or indirect TLB entries
- * flushing when PTE pages are freed
- */
-void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
-{
- int tsize = mmu_psize_defs[mmu_pte_psize].enc;
-
- if (book3e_htw_mode != PPC_HTW_NONE) {
- unsigned long start = address & PMD_MASK;
- unsigned long end = address + PMD_SIZE;
- unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
-
- /* This isn't the most optimal, ideally we would factor out the
- * while preempt & CPU mask mucking around, or even the IPI but
- * it will do for now
- */
- while (start < end) {
- __flush_tlb_page(tlb->mm, start, tsize, 1);
- start += size;
- }
- } else {
- unsigned long rmask = 0xf000000000000000ul;
- unsigned long rid = (address & rmask) | 0x1000000000000000ul;
- unsigned long vpte = address & ~rmask;
-
- vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
- vpte |= rid;
- __flush_tlb_page(tlb->mm, vpte, tsize, 0);
- }
-}
-
-static void __init setup_page_sizes(void)
-{
- unsigned int tlb0cfg;
- unsigned int tlb0ps;
- unsigned int eptcfg;
- int i, psize;
-
-#ifdef CONFIG_PPC_E500
- unsigned int mmucfg = mfspr(SPRN_MMUCFG);
- int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
- unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
- unsigned int min_pg, max_pg;
-
- min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
- max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def;
- unsigned int shift;
-
- def = &mmu_psize_defs[psize];
- shift = def->shift;
-
- if (shift == 0 || shift & 1)
- continue;
-
- /* adjust to be in terms of 4^shift Kb */
- shift = (shift - 10) >> 1;
-
- if ((shift >= min_pg) && (shift <= max_pg))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- goto out;
- }
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
- u32 tlb1cfg, tlb1ps;
-
- tlb0cfg = mfspr(SPRN_TLB0CFG);
- tlb1cfg = mfspr(SPRN_TLB1CFG);
- tlb1ps = mfspr(SPRN_TLB1PS);
- eptcfg = mfspr(SPRN_EPTCFG);
-
- if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
- book3e_htw_mode = PPC_HTW_E6500;
-
- /*
- * We expect 4K subpage size and unrestricted indirect size.
- * The lack of a restriction on indirect size is a Freescale
- * extension, indicated by PSn = 0 but SPSn != 0.
- */
- if (eptcfg != 2)
- book3e_htw_mode = PPC_HTW_NONE;
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (!def->shift)
- continue;
-
- if (tlb1ps & (1U << (def->shift - 10))) {
- def->flags |= MMU_PAGE_SIZE_DIRECT;
-
- if (book3e_htw_mode && psize == MMU_PAGE_2M)
- def->flags |= MMU_PAGE_SIZE_INDIRECT;
- }
- }
-
- goto out;
- }
-#endif
-
- tlb0cfg = mfspr(SPRN_TLB0CFG);
- tlb0ps = mfspr(SPRN_TLB0PS);
- eptcfg = mfspr(SPRN_EPTCFG);
-
- /* Look for supported direct sizes */
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (tlb0ps & (1U << (def->shift - 10)))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- /* Indirect page sizes supported ? */
- if ((tlb0cfg & TLBnCFG_IND) == 0 ||
- (tlb0cfg & TLBnCFG_PT) == 0)
- goto out;
-
- book3e_htw_mode = PPC_HTW_IBM;
-
- /* Now, we only deal with one IND page size for each
- * direct size. Hopefully all implementations today are
- * unambiguous, but we might want to be careful in the
- * future.
- */
- for (i = 0; i < 3; i++) {
- unsigned int ps, sps;
-
- sps = eptcfg & 0x1f;
- eptcfg >>= 5;
- ps = eptcfg & 0x1f;
- eptcfg >>= 5;
- if (!ps || !sps)
- continue;
- for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (ps == (def->shift - 10))
- def->flags |= MMU_PAGE_SIZE_INDIRECT;
- if (sps == (def->shift - 10))
- def->ind = ps + 10;
- }
- }
-
-out:
- /* Cleanup array and print summary */
- pr_info("MMU: Supported page sizes\n");
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
- const char *__page_type_names[] = {
- "unsupported",
- "direct",
- "indirect",
- "direct & indirect"
- };
- if (def->flags == 0) {
- def->shift = 0;
- continue;
- }
- pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
- __page_type_names[def->flags & 0x3]);
- }
-}
-
-static void __init setup_mmu_htw(void)
-{
- /*
- * If we want to use HW tablewalk, enable it by patching the TLB miss
- * handlers to branch to the one dedicated to it.
- */
-
- switch (book3e_htw_mode) {
- case PPC_HTW_IBM:
- patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
- break;
-#ifdef CONFIG_PPC_E500
- case PPC_HTW_E6500:
- extlb_level_exc = EX_TLB_SIZE;
- patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
- break;
-#endif
- }
- pr_info("MMU: Book3E HW tablewalk %s\n",
- book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
-}
-
-/*
- * Early initialization of the MMU TLB code
- */
-static void early_init_this_mmu(void)
-{
- unsigned int mas4;
-
- /* Set MAS4 based on page table setting */
-
- mas4 = 0x4 << MAS4_WIMGED_SHIFT;
- switch (book3e_htw_mode) {
- case PPC_HTW_E6500:
- mas4 |= MAS4_INDD;
- mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
- mas4 |= MAS4_TLBSELD(1);
- mmu_pte_psize = MMU_PAGE_2M;
- break;
-
- case PPC_HTW_IBM:
- mas4 |= MAS4_INDD;
- mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_1M;
- break;
-
- case PPC_HTW_NONE:
- mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = mmu_virtual_psize;
- break;
- }
- mtspr(SPRN_MAS4, mas4);
-
-#ifdef CONFIG_PPC_E500
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned int num_cams;
- bool map = true;
-
- /* use a quarter of the TLBCAM for bolted linear map */
- num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
-
- /*
- * Only do the mapping once per core, or else the
- * transient mapping would cause problems.
- */
-#ifdef CONFIG_SMP
- if (hweight32(get_tensr()) > 1)
- map = false;
-#endif
-
- if (map)
- linear_map_top = map_mem_in_cams(linear_map_top,
- num_cams, false, true);
- }
-#endif
-
- /* A sync won't hurt us after mucking around with
- * the MMU configuration
- */
- mb();
-}
-
-static void __init early_init_mmu_global(void)
-{
- /* XXX This should be decided at runtime based on supported
- * page sizes in the TLB, but for now let's assume 16M is
- * always there and a good fit (which it probably is)
- *
- * Freescale booke only supports 4K pages in TLB0, so use that.
- */
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
- mmu_vmemmap_psize = MMU_PAGE_4K;
- else
- mmu_vmemmap_psize = MMU_PAGE_16M;
-
- /* XXX This code only checks for TLB 0 capabilities and doesn't
- * check what page size combos are supported by the HW. It
- * also doesn't handle the case where a separate array holds
- * the IND entries from the array loaded by the PT.
- */
- /* Look for supported page sizes */
- setup_page_sizes();
-
- /* Look for HW tablewalk support */
- setup_mmu_htw();
-
-#ifdef CONFIG_PPC_E500
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- if (book3e_htw_mode == PPC_HTW_NONE) {
- extlb_level_exc = EX_TLB_SIZE;
- patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
- patch_exception(0x1e0,
- exc_instruction_tlb_miss_bolted_book3e);
- }
- }
-#endif
-
- /* Set the global containing the top of the linear mapping
- * for use by the TLB miss code
- */
- linear_map_top = memblock_end_of_DRAM();
-
- ioremap_bot = IOREMAP_BASE;
-}
-
-static void __init early_mmu_set_memory_limit(void)
-{
-#ifdef CONFIG_PPC_E500
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- /*
- * Limit memory so we dont have linear faults.
- * Unlike memblock_set_current_limit, which limits
- * memory available during early boot, this permanently
- * reduces the memory available to Linux. We need to
- * do this because highmem is not supported on 64-bit.
- */
- memblock_enforce_memory_limit(linear_map_top);
- }
-#endif
-
- memblock_set_current_limit(linear_map_top);
-}
-
-/* boot cpu only */
-void __init early_init_mmu(void)
-{
- early_init_mmu_global();
- early_init_this_mmu();
- early_mmu_set_memory_limit();
-}
-
-void early_init_mmu_secondary(void)
-{
- early_init_this_mmu();
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* On non-FSL Embedded 64-bit, we adjust the RMA size to match
- * the bolted TLB entry. We know for now that only 1G
- * entries are supported though that may eventually
- * change.
- *
- * on FSL Embedded 64-bit, usually all RAM is bolted, but with
- * unusual memory sizes it's possible for some RAM to not be mapped
- * (such RAM is not used at all by Linux, since we don't support
- * highmem on 64-bit). We limit ppc64_rma_size to what would be
- * mappable if this memblock is the only one. Additional memblocks
- * can only increase, not decrease, the amount that ends up getting
- * mapped. We still limit max to 1G even if we'll eventually map
- * more. This is due to what the early init code is set up to do.
- *
- * We crop it to the size of the first MEMBLOCK to
- * avoid going over total available memory just in case...
- */
-#ifdef CONFIG_PPC_E500
- if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned long linear_sz;
- unsigned int num_cams;
-
- /* use a quarter of the TLBCAM for bolted linear map */
- num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
-
- linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
- true, true);
-
- ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
- } else
-#endif
- ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
-
- /* Finally limit subsequent allocations */
- memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
-}
-#else /* ! CONFIG_PPC64 */
+#ifndef CONFIG_PPC64
void __init early_init_mmu(void)
{
unsigned long root = of_get_flat_dt_root();
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
new file mode 100644
index 000000000000..113edf76d3ce
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/code-patching.h>
+#include <asm/cputhreads.h>
+
+#include <mm/mmu_decl.h>
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+int mmu_pte_psize; /* Page size used for PTE pages */
+int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
+unsigned long linear_map_top; /* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+ int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
+
+ if (book3e_htw_mode != PPC_HTW_NONE) {
+ unsigned long start = address & PMD_MASK;
+ unsigned long end = address + PMD_SIZE;
+ unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+ /* This isn't the most optimal, ideally we would factor out the
+ * while preempt & CPU mask mucking around, or even the IPI but
+ * it will do for now
+ */
+ while (start < end) {
+ __flush_tlb_page(tlb->mm, start, tsize, 1);
+ start += size;
+ }
+ } else {
+ unsigned long rmask = 0xf000000000000000ul;
+ unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+ unsigned long vpte = address & ~rmask;
+
+ vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+ vpte |= rid;
+ __flush_tlb_page(tlb->mm, vpte, tsize, 0);
+ }
+}
+
+static void __init setup_page_sizes(void)
+{
+ unsigned int tlb0cfg;
+ unsigned int eptcfg;
+ int psize;
+
+ unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+ unsigned int min_pg, max_pg;
+
+ min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+ max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def;
+ unsigned int shift;
+
+ def = &mmu_psize_defs[psize];
+ shift = def->shift;
+
+ if (shift == 0 || shift & 1)
+ continue;
+
+ /* adjust to be in terms of 4^shift Kb */
+ shift = (shift - 10) >> 1;
+
+ if ((shift >= min_pg) && (shift <= max_pg))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ goto out;
+ }
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (!def->shift)
+ continue;
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
+ }
+out:
+ /* Cleanup array and print summary */
+ pr_info("MMU: Supported page sizes\n");
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ const char *__page_type_names[] = {
+ "unsupported",
+ "direct",
+ "indirect",
+ "direct & indirect"
+ };
+ if (def->flags == 0) {
+ def->shift = 0;
+ continue;
+ }
+ pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
+ __page_type_names[def->flags & 0x3]);
+ }
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void early_init_this_mmu(void)
+{
+ unsigned int mas4;
+
+ /* Set MAS4 based on page table setting */
+
+ mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_NONE:
+ mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+ mmu_pte_psize = mmu_virtual_psize;
+ break;
+ }
+ mtspr(SPRN_MAS4, mas4);
+
+ unsigned int num_cams;
+ bool map = true;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ /*
+ * Only do the mapping once per core, or else the
+ * transient mapping would cause problems.
+ */
+#ifdef CONFIG_SMP
+ if (hweight32(get_tensr()) > 1)
+ map = false;
+#endif
+
+ if (map)
+ linear_map_top = map_mem_in_cams(linear_map_top,
+ num_cams, false, true);
+
+ /* A sync won't hurt us after mucking around with
+ * the MMU configuration
+ */
+ mb();
+}
+
+static void __init early_init_mmu_global(void)
+{
+ /*
+ * Freescale booke only supports 4K pages in TLB0, so use that.
+ */
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+
+ /* XXX This code only checks for TLB 0 capabilities and doesn't
+ * check what page size combos are supported by the HW. It
+ * also doesn't handle the case where a separate array holds
+ * the IND entries from the array loaded by the PT.
+ */
+ /* Look for supported page sizes */
+ setup_page_sizes();
+
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
+ extlb_level_exc = EX_TLB_SIZE;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+ }
+
+ pr_info("MMU: Book3E HW tablewalk %s\n",
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+
+ /* Set the global containing the top of the linear mapping
+ * for use by the TLB miss code
+ */
+ linear_map_top = memblock_end_of_DRAM();
+
+ ioremap_bot = IOREMAP_BASE;
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+ /*
+ * Limit memory so we dont have linear faults.
+ * Unlike memblock_set_current_limit, which limits
+ * memory available during early boot, this permanently
+ * reduces the memory available to Linux. We need to
+ * do this because highmem is not supported on 64-bit.
+ */
+ memblock_enforce_memory_limit(linear_map_top);
+
+ memblock_set_current_limit(linear_map_top);
+}
+
+/* boot cpu only */
+void __init early_init_mmu(void)
+{
+ early_init_mmu_global();
+ early_init_this_mmu();
+ early_mmu_set_memory_limit();
+}
+
+void early_init_mmu_secondary(void)
+{
+ early_init_this_mmu();
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * On FSL Embedded 64-bit, usually all RAM is bolted, but with
+ * unusual memory sizes it's possible for some RAM to not be mapped
+ * (such RAM is not used at all by Linux, since we don't support
+ * highmem on 64-bit). We limit ppc64_rma_size to what would be
+ * mappable if this memblock is the only one. Additional memblocks
+ * can only increase, not decrease, the amount that ends up getting
+ * mapped. We still limit max to 1G even if we'll eventually map
+ * more. This is due to what the early init code is set up to do.
+ *
+ * We crop it to the size of the first MEMBLOCK to
+ * avoid going over total available memory just in case...
+ */
+ unsigned long linear_sz;
+ unsigned int num_cams;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true);
+ ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index 7e0b8fe1c279..de568297d5c5 100644
--- a/arch/powerpc/mm/nohash/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -450,11 +450,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT)
tlb_miss_huge_e6500:
beq tlb_miss_fault_e6500
- li r10,1
- andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */
- rldimi r14,r10,63,0 /* Set PD_HUGE */
- xor r14,r14,r15 /* Clear size bits */
- ldx r14,0,r14
+ rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e
/*
* Now we build the MAS for a huge page.
@@ -465,7 +461,6 @@ tlb_miss_huge_e6500:
* MAS 2,3+7: Needs to be redone similar to non-tablewalk handler
*/
- subi r15,r15,10 /* Convert psize to tsize */
mfspr r10,SPRN_MAS1
rlwinm r10,r10,0,~MAS1_IND
rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
@@ -511,232 +506,6 @@ itlb_miss_fault_e6500:
tlb_epilog_bolted
b exc_instruction_storage_book3e
-/**********************************************************************
- * *
- * TLB miss handling for Book3E with TLB reservation and HES support *
- * *
- **********************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r15,r16,44 /* get region */
- xoris r15,r15,0xc
- cmpldi cr0,r15,0 /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
- cmpldi cr1,r15,1 /* vmalloc mapping ? */
-
- /* The page tables are mapped virtually linear. At this point, though,
- * we don't know whether we are trying to fault in a first level
- * virtual address or a virtual page table address. We can get that
- * from bit 0x1 of the region ID which we have set for a page table
- */
- andis. r10,r15,0x1
- bne- virt_page_table_tlb_miss
-
- std r14,EX_TLB_ESR(r12); /* save ESR */
- std r16,EX_TLB_DEAR(r12); /* save DEAR */
-
- /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
- li r11,_PAGE_PRESENT
- oris r11,r11,_PAGE_ACCESSED@h
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- srdi. r15,r16,60 /* Check for user region */
-
- /* We pre-test some combination of permissions to avoid double
- * faults:
- *
- * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
- * ESR_ST is 0x00800000
- * _PAGE_BAP_SW is 0x00000010
- * So the shift is >> 19. This tests for supervisor writeability.
- * If the page happens to be supervisor writeable and not user
- * writeable, we will take a new fault later, but that should be
- * a rare enough case.
- *
- * We also move ESR_ST in _PAGE_DIRTY position
- * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
- *
- * MAS1 is preset for all we need except for TID that needs to
- * be cleared for kernel translations
- */
- rlwimi r11,r14,32-19,27,27
- rlwimi r11,r14,32-16,19,19
- beq normal_tlb_miss_user
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ cr1,normal_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by writing a crazy value in ESR in our exception frame
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r15,r16,44 /* get region */
- xoris r15,r15,0xc
- cmpldi cr0,r15,0 /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
- cmpldi cr1,r15,1 /* vmalloc mapping ? */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
-
- srdi. r15,r16,60 /* Check for user region */
- std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
- beq normal_tlb_miss_user
-
- li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
- /* XXX replace the RMW cycles with immediate loads + writes */
- mfspr r10,SPRN_MAS1
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ cr1,normal_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-/*
- * This is the guts of the first-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = faulting address
- * r15 = region ID
- * r14 = crap (free to use)
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = PTE permission mask
- * r10 = crap (free to use)
- */
-normal_tlb_miss_user:
-#ifdef CONFIG_PPC_KUAP
- mfspr r14,SPRN_MAS1
- rlwinm. r14,r14,0,0x3fff0000
- beq- normal_tlb_miss_access_fault /* KUAP fault */
-#endif
-normal_tlb_miss:
- /* So we first construct the page table address. We do that by
- * shifting the bottom of the address (not the region ID) by
- * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
- * or'ing the fourth high bit.
- *
- * NOTE: For 64K pages, we do things slightly differently in
- * order to handle the weird page table format used by linux
- */
- srdi r15,r16,44
- oris r10,r15,0x1
- rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
- sldi r15,r10,44
- clrrdi r14,r14,19
- or r10,r15,r14
-
- ld r14,0(r10)
-
-finish_normal_tlb_miss:
- /* Check if required permissions are met */
- andc. r15,r11,r14
- bne- normal_tlb_miss_access_fault
-
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE need change if !base page size, not
- * yet implemented for now
- * MAS 2 : Defaults not useful, need to be redone
- * MAS 3+7 : Needs to be done
- *
- * TODO: mix up code below for better scheduling
- */
- clrrdi r10,r16,12 /* Clear low crap in EA */
- rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */
- mtspr SPRN_MAS2,r10
-
- /* Check page size, if not standard, update MAS1 */
- rldicl r10,r14,64-8,64-8
- cmpldi cr0,r10,BOOK3E_PAGESZ_4K
- beq- 1f
- mfspr r11,SPRN_MAS1
- rlwimi r11,r14,31,21,24
- rlwinm r11,r11,0,21,19
- mtspr SPRN_MAS1,r11
-1:
- /* Move RPN in position */
- rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
- clrldi r15,r11,12 /* Clear crap at the top */
- rlwimi r15,r14,32-8,22,25 /* Move in U bits */
- rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
-
- /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
- andi. r11,r14,_PAGE_DIRTY
- bne 1f
- li r11,MAS3_SW|MAS3_UW
- andc r15,r15,r11
-1:
- srdi r16,r15,32
- mtspr SPRN_MAS3,r15
- mtspr SPRN_MAS7,r16
-
- tlbwe
-
-normal_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-normal_tlb_miss_access_fault:
- /* We need to check if it was an instruction miss */
- andi. r10,r11,_PAGE_BAP_UX
- bne 1f
- ld r14,EX_TLB_DEAR(r12)
- ld r15,EX_TLB_ESR(r12)
- mtspr SPRN_DEAR,r14
- mtspr SPRN_ESR,r15
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-
/*
* This is the guts of the second-level TLB miss handler for direct
* misses. We are entered with:
@@ -893,201 +662,6 @@ virt_page_table_tlb_miss_whacko_fault:
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
-
-/**************************************************************
- * *
- * TLB miss handling for Book3E with hw page table support *
- * *
- **************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r11,r16,44 /* get region */
- xoris r11,r11,0xc
- cmpldi cr0,r11,0 /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
- cmpldi cr1,r11,1 /* vmalloc mapping ? */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- srdi. r11,r16,60 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ cr1,htw_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by keeping a crazy value for ESR in r14
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r11,r16,44 /* get region */
- xoris r11,r11,0xc
- cmpldi cr0,r11,0 /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
- cmpldi cr1,r11,1 /* vmalloc mapping ? */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- srdi. r11,r16,60 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ htw_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-
-/*
- * This is the guts of the second-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = virtual page table faulting address
- * r15 = PGD pointer
- * r14 = ESR
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = crap (free to use)
- * r10 = crap (free to use)
- *
- * It can be re-entered by the linear mapping miss handler. However, to
- * avoid too much complication, it will save/restore things for us
- */
-htw_tlb_miss:
-#ifdef CONFIG_PPC_KUAP
- mfspr r10,SPRN_MAS1
- rlwinm. r10,r10,0,0x3fff0000
- beq- htw_tlb_miss_fault /* KUAP fault */
-#endif
- /* Search if we already have a TLB entry for that virtual address, and
- * if we do, bail out.
- *
- * MAS1:IND should be already set based on MAS4
- */
- PPC_TLBSRX_DOT(0,R16)
- beq htw_tlb_miss_done
-
- /* Now, we need to walk the page tables. First check if we are in
- * range.
- */
- rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
- bne- htw_tlb_miss_fault
-
- /* Get the PGD pointer */
- cmpldi cr0,r15,0
- beq- htw_tlb_miss_fault
-
- /* Get to PGD entry */
- rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Get to PUD entry */
- rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Get to PMD entry */
- rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Ok, we're all right, we can now create an indirect entry for
- * a 1M or 256M page.
- *
- * The last trick is now that because we use "half" pages for
- * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
- * for an added LSB bit to the RPN. For 64K pages, there is no
- * problem as we already use 32K arrays (half PTE pages), but for
- * 4K page we need to extract a bit from the virtual address and
- * insert it into the "PA52" bit of the RPN.
- */
- rlwimi r15,r16,32-9,20,20
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE for now is base ind page size always
- * MAS 2 : Use defaults
- * MAS 3+7 : Needs to be done
- */
- ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
-
- srdi r16,r10,32
- mtspr SPRN_MAS3,r10
- mtspr SPRN_MAS7,r16
-
- tlbwe
-
-htw_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-htw_tlb_miss_fault:
- /* We need to check if it was an instruction miss. We know this
- * though because r14 would contain -1
- */
- cmpdi cr0,r14,-1
- beq 1f
- mtspr SPRN_DEAR,r16
- mtspr SPRN_ESR,r14
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
/*
* This is the guts of "any" level TLB miss handler for kernel linear
* mapping misses. We are entered with:
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9e7ba9c3851f..ab0656115424 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -297,11 +297,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
}
#if defined(CONFIG_PPC_8xx)
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte, unsigned long sz)
+static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val)
{
- pmd_t *pmd = pmd_off(mm, addr);
- pte_basic_t val;
pte_basic_t *entry = (pte_basic_t *)ptep;
int num, i;
@@ -311,15 +308,60 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
- pte = set_pte_filter(pte, addr);
-
- val = pte_val(pte);
-
num = number_of_cells_per_pte(pmd, val, 1);
for (i = 0; i < num; i++, entry++, val += SZ_4K)
*entry = val;
}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ pmd_t *pmdp = pmd_off(mm, addr);
+
+ pte = set_pte_filter(pte, addr);
+
+ if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */
+ *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M);
+ *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M);
+
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);
+ } else {
+ __set_huge_pte_at(pmdp, ptep, pte_val(pte));
+ }
+}
+#else
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ unsigned long pdsize;
+ int i;
+
+ pte = set_pte_filter(pte, addr);
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ if (sz < PMD_SIZE)
+ pdsize = PAGE_SIZE;
+ else if (sz < PUD_SIZE)
+ pdsize = PMD_SIZE;
+ else if (sz < P4D_SIZE)
+ pdsize = PUD_SIZE;
+ else if (sz < PGDIR_SIZE)
+ pdsize = P4D_SIZE;
+ else
+ pdsize = PGDIR_SIZE;
+
+ for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) {
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT));
+ }
+}
#endif
#endif /* CONFIG_HUGETLB_PAGE */
@@ -367,11 +409,10 @@ unsigned long vmalloc_to_phys(void *va)
EXPORT_SYMBOL_GPL(vmalloc_to_phys);
/*
- * We have 4 cases for pgds and pmds:
+ * We have 3 cases for pgds and pmds:
* (1) invalid (all zeroes)
* (2) pointer to next table, as normal; bottom 6 bits == 0
* (3) leaf pte for huge page _PAGE_PTE set
- * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
*
* So long as we atomically load page table pointers we are safe against teardown,
* we can follow the address down to the page and take a ref on it.
@@ -382,11 +423,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
pgd_t *pgdp;
+#ifdef CONFIG_PPC64
p4d_t p4d, *p4dp;
pud_t pud, *pudp;
+#endif
pmd_t pmd, *pmdp;
pte_t *ret_pte;
- hugepd_t *hpdp = NULL;
unsigned pdshift;
if (hpage_shift)
@@ -401,8 +443,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
* Top level is an exception because it is folded into p4d.
+ *
+ * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to
+ * PMD level.
*/
pgdp = pgdir + pgd_index(ea);
+#ifdef CONFIG_PPC64
p4dp = p4d_offset(pgdp, ea);
p4d = READ_ONCE(*p4dp);
pdshift = P4D_SHIFT;
@@ -415,11 +461,6 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
goto out;
}
- if (is_hugepd(__hugepd(p4d_val(p4d)))) {
- hpdp = (hugepd_t *)&p4d;
- goto out_huge;
- }
-
/*
* Even if we end up with an unmap, the pgtable will not
* be freed, because we do an rcu free and here we are
@@ -437,13 +478,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
goto out;
}
- if (is_hugepd(__hugepd(pud_val(pud)))) {
- hpdp = (hugepd_t *)&pud;
- goto out_huge;
- }
-
- pdshift = PMD_SHIFT;
pmdp = pmd_offset(&pud, ea);
+#else
+ pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea);
+#endif
+ pdshift = PMD_SHIFT;
pmd = READ_ONCE(*pmdp);
/*
@@ -476,19 +515,8 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
goto out;
}
- if (is_hugepd(__hugepd(pmd_val(pmd)))) {
- hpdp = (hugepd_t *)&pmd;
- goto out_huge;
- }
-
return pte_offset_kernel(&pmd, ea);
-out_huge:
- if (!hpdp)
- return NULL;
-
- ret_pte = hugepte_offset(*hpdp, ea, pdshift);
- pdshift = hugepd_shift(*hpdp);
out:
if (hpage_shift)
*hpage_shift = pdshift;
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index cfd622ebf774..787b22206386 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,7 +48,7 @@ notrace void __init early_ioremap_init(void)
early_ioremap_setup();
}
-static void __init *early_alloc_pgtable(unsigned long size)
+void __init *early_alloc_pgtable(unsigned long size)
{
void *ptr = memblock_alloc(size, size);
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index b1ce97a9dbfc..faf3624d8057 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t pte, int dirty);
#define __HAVE_ARCH_HUGE_PTEP_GET
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index ab7a759e1a8c..089f3c9f56a3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -514,8 +514,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
#define update_mmu_cache(vma, addr, ptep) \
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
-#define __HAVE_ARCH_UPDATE_MMU_TLB
-#define update_mmu_tlb update_mmu_cache
+#define update_mmu_tlb_range(vma, addr, ptep, nr) \
+ update_mmu_cache_range(NULL, vma, addr, ptep, nr)
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 0ebd968b33c9..42314f093922 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
#include <linux/err.h>
#ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
unsigned long pte_num;
int i;
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 371c2bf88149..59e0d861e26f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -159,6 +159,7 @@ config S390
select HAVE_ARCH_KASAN
select HAVE_ARCH_KASAN_VMALLOC
select HAVE_ARCH_KCSAN
+ select HAVE_ARCH_KMSAN
select HAVE_ARCH_KFENCE
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index f2b21c7a70ef..7fd57398221e 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -36,7 +36,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
UTS_MACHINE := s390x
-STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384)
+STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384))
CHECKFLAGS += -D__s390__ -D__s390x__
export LD_BFD
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index 070c9b2e905f..e7658997452b 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -3,11 +3,13 @@
# Makefile for the linux s390-specific parts of the memory manager.
#
+# Tooling runtimes are unavailable and cannot be linked for early boot code
KCOV_INSTRUMENT := n
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
+KMSAN_SANITIZE := n
KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
@@ -42,6 +44,7 @@ obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) +=
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o
+obj-$(CONFIG_KMSAN) += kmsan.o
obj-all := $(obj-y) piggy.o syms.o
targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
diff --git a/arch/s390/boot/kmsan.c b/arch/s390/boot/kmsan.c
new file mode 100644
index 000000000000..e7b3ac48143e
--- /dev/null
+++ b/arch/s390/boot/kmsan.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kmsan-checks.h>
+
+void kmsan_unpoison_memory(const void *address, size_t size)
+{
+}
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 320c964cd603..c59014945af0 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -304,11 +304,18 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size)
MODULES_END = round_down(kernel_start, _SEGMENT_SIZE);
MODULES_VADDR = MODULES_END - MODULES_LEN;
VMALLOC_END = MODULES_VADDR;
+ if (IS_ENABLED(CONFIG_KMSAN))
+ VMALLOC_END -= MODULES_LEN * 2;
/* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */
vsize = (VMALLOC_END - FIXMAP_SIZE) / 2;
vsize = round_down(vsize, _SEGMENT_SIZE);
vmalloc_size = min(vmalloc_size, vsize);
+ if (IS_ENABLED(CONFIG_KMSAN)) {
+ /* take 2/3 of vmalloc area for KMSAN shadow and origins */
+ vmalloc_size = round_down(vmalloc_size / 3, _SEGMENT_SIZE);
+ VMALLOC_END -= vmalloc_size * 2;
+ }
VMALLOC_START = VMALLOC_END - vmalloc_size;
__memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE);
diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c
index faccb33b462c..f6b9b1df48a8 100644
--- a/arch/s390/boot/string.c
+++ b/arch/s390/boot/string.c
@@ -1,11 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
+#define IN_BOOT_STRING_C 1
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#undef CONFIG_KASAN
#undef CONFIG_KASAN_GENERIC
+#undef CONFIG_KMSAN
#include "../lib/string.c"
+/*
+ * Duplicate some functions from the common lib/string.c
+ * instead of fully including it.
+ */
+
int strncmp(const char *cs, const char *ct, size_t count)
{
unsigned char c1, c2;
@@ -22,6 +29,15 @@ int strncmp(const char *cs, const char *ct, size_t count)
return 0;
}
+void *memset64(uint64_t *s, uint64_t v, size_t count)
+{
+ uint64_t *xs = s;
+
+ while (count--)
+ *xs++ = v;
+ return s;
+}
+
char *skip_spaces(const char *str)
{
while (isspace(*str))
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index b89159591ca0..46f5c9660616 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -13,6 +13,7 @@
#define _S390_CHECKSUM_H
#include <linux/instrumented.h>
+#include <linux/kmsan-checks.h>
#include <linux/in6.h>
static inline __wsum cksm(const void *buff, int len, __wsum sum)
@@ -23,6 +24,7 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)
};
instrument_read(buff, len);
+ kmsan_check_memory(buff, len);
asm volatile("\n"
"0: cksm %[sum],%[rp]\n"
" jo 0b\n"
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index c786538e397c..dae8843b164f 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -12,6 +12,7 @@
#define _ASM_S390_CPACF_H
#include <asm/facility.h>
+#include <linux/kmsan-checks.h>
/*
* Instruction opcodes for the CPACF instructions
@@ -542,6 +543,8 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
: [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair)
: [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO)
: "cc", "memory", "0");
+ kmsan_unpoison_memory(ucbuf, ucbuf_len);
+ kmsan_unpoison_memory(cbuf, cbuf_len);
}
/**
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index a0de5b9b02ea..9e4bbc3e53f8 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -10,6 +10,7 @@
#define _ASM_S390_CPU_MF_H
#include <linux/errno.h>
+#include <linux/kmsan-checks.h>
#include <asm/asm-extable.h>
#include <asm/facility.h>
@@ -239,6 +240,11 @@ static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest)
: "=d" (cc)
: "Q" (*dest), "d" (range), "i" (set)
: "cc", "memory");
+ /*
+ * If cc == 2, less than RANGE counters are stored, but it's not easy
+ * to tell how many. Always unpoison the whole range for simplicity.
+ */
+ kmsan_unpoison_memory(dest, range * sizeof(u64));
return cc;
}
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index ce5f4fe8be4d..cf1b5d6fb1a6 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned long sz);
void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
-pte_t huge_ptep_get(pte_t *ptep);
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
@@ -64,7 +64,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
{
- int changed = !pte_same(huge_ptep_get(ptep), pte);
+ int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte);
if (changed) {
huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
__set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h
index 02427b205c11..bcab456dfb80 100644
--- a/arch/s390/include/asm/irqflags.h
+++ b/arch/s390/include/asm/irqflags.h
@@ -37,12 +37,18 @@ static __always_inline void __arch_local_irq_ssm(unsigned long flags)
asm volatile("ssm %0" : : "Q" (flags) : "memory");
}
-static __always_inline unsigned long arch_local_save_flags(void)
+#ifdef CONFIG_KMSAN
+#define arch_local_irq_attributes noinline notrace __no_sanitize_memory __maybe_unused
+#else
+#define arch_local_irq_attributes __always_inline
+#endif
+
+static arch_local_irq_attributes unsigned long arch_local_save_flags(void)
{
return __arch_local_irq_stnsm(0xff);
}
-static __always_inline unsigned long arch_local_irq_save(void)
+static arch_local_irq_attributes unsigned long arch_local_irq_save(void)
{
return __arch_local_irq_stnsm(0xfc);
}
@@ -52,7 +58,12 @@ static __always_inline void arch_local_irq_disable(void)
arch_local_irq_save();
}
-static __always_inline void arch_local_irq_enable(void)
+static arch_local_irq_attributes void arch_local_irq_enable_external(void)
+{
+ __arch_local_irq_stosm(0x01);
+}
+
+static arch_local_irq_attributes void arch_local_irq_enable(void)
{
__arch_local_irq_stosm(0x03);
}
diff --git a/arch/s390/include/asm/kmsan.h b/arch/s390/include/asm/kmsan.h
new file mode 100644
index 000000000000..27db65fbf3f6
--- /dev/null
+++ b/arch/s390/include/asm/kmsan.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_KMSAN_H
+#define _ASM_S390_KMSAN_H
+
+#include <asm/lowcore.h>
+#include <asm/page.h>
+#include <linux/kmsan.h>
+#include <linux/mmzone.h>
+#include <linux/stddef.h>
+
+#ifndef MODULE
+
+static inline bool is_lowcore_addr(void *addr)
+{
+ return addr >= (void *)&S390_lowcore &&
+ addr < (void *)(&S390_lowcore + 1);
+}
+
+static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin)
+{
+ if (is_lowcore_addr(addr)) {
+ /*
+ * Different lowcores accessed via S390_lowcore are described
+ * by the same struct page. Resolve the prefix manually in
+ * order to get a distinct struct page.
+ */
+ addr += (void *)lowcore_ptr[raw_smp_processor_id()] -
+ (void *)&S390_lowcore;
+ if (KMSAN_WARN_ON(is_lowcore_addr(addr)))
+ return NULL;
+ return kmsan_get_metadata(addr, is_origin);
+ }
+ return NULL;
+}
+
+static inline bool kmsan_virt_addr_valid(void *addr)
+{
+ bool ret;
+
+ /*
+ * pfn_valid() relies on RCU, and may call into the scheduler on exiting
+ * the critical section. However, this would result in recursion with
+ * KMSAN. Therefore, disable preemption here, and re-enable preemption
+ * below while suppressing reschedules to avoid recursion.
+ *
+ * Note, this sacrifices occasionally breaking scheduling guarantees.
+ * Although, a kernel compiled with KMSAN has already given up on any
+ * performance guarantees due to being heavily instrumented.
+ */
+ preempt_disable();
+ ret = virt_addr_valid(addr);
+ preempt_enable_no_resched();
+
+ return ret;
+}
+
+#endif /* !MODULE */
+
+#endif /* _ASM_S390_KMSAN_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b5632dbe5438..3fa280d0672a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -107,6 +107,18 @@ static inline int is_module_addr(void *addr)
return 1;
}
+#ifdef CONFIG_KMSAN
+#define KMSAN_VMALLOC_SIZE (VMALLOC_END - VMALLOC_START)
+#define KMSAN_VMALLOC_SHADOW_START VMALLOC_END
+#define KMSAN_VMALLOC_SHADOW_END (KMSAN_VMALLOC_SHADOW_START + KMSAN_VMALLOC_SIZE)
+#define KMSAN_VMALLOC_ORIGIN_START KMSAN_VMALLOC_SHADOW_END
+#define KMSAN_VMALLOC_ORIGIN_END (KMSAN_VMALLOC_ORIGIN_START + KMSAN_VMALLOC_SIZE)
+#define KMSAN_MODULES_SHADOW_START KMSAN_VMALLOC_ORIGIN_END
+#define KMSAN_MODULES_SHADOW_END (KMSAN_MODULES_SHADOW_START + MODULES_LEN)
+#define KMSAN_MODULES_ORIGIN_START KMSAN_MODULES_SHADOW_END
+#define KMSAN_MODULES_ORIGIN_END (KMSAN_MODULES_ORIGIN_START + MODULES_LEN)
+#endif
+
#ifdef CONFIG_RANDOMIZE_BASE
#define KASLR_LEN (1UL << 31)
#else
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 351685de53d2..2ab868cbae6c 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -15,15 +15,12 @@
#define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */
#define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */
#define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */
-#define __HAVE_ARCH_MEMSET16 /* arch function */
-#define __HAVE_ARCH_MEMSET32 /* arch function */
-#define __HAVE_ARCH_MEMSET64 /* arch function */
void *memcpy(void *dest, const void *src, size_t n);
void *memset(void *s, int c, size_t n);
void *memmove(void *dest, const void *src, size_t n);
-#ifndef CONFIG_KASAN
+#if !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN)
#define __HAVE_ARCH_MEMCHR /* inline & arch function */
#define __HAVE_ARCH_MEMCMP /* arch function */
#define __HAVE_ARCH_MEMSCAN /* inline & arch function */
@@ -36,6 +33,9 @@ void *memmove(void *dest, const void *src, size_t n);
#define __HAVE_ARCH_STRNCPY /* arch function */
#define __HAVE_ARCH_STRNLEN /* inline & arch function */
#define __HAVE_ARCH_STRSTR /* arch function */
+#define __HAVE_ARCH_MEMSET16 /* arch function */
+#define __HAVE_ARCH_MEMSET32 /* arch function */
+#define __HAVE_ARCH_MEMSET64 /* arch function */
/* Prototypes for non-inlined arch strings functions. */
int memcmp(const void *s1, const void *s2, size_t n);
@@ -44,7 +44,7 @@ size_t strlcat(char *dest, const char *src, size_t n);
char *strncat(char *dest, const char *src, size_t n);
char *strncpy(char *dest, const char *src, size_t n);
char *strstr(const char *s1, const char *s2);
-#endif /* !CONFIG_KASAN */
+#endif /* !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) */
#undef __HAVE_ARCH_STRCHR
#undef __HAVE_ARCH_STRNCHR
@@ -74,20 +74,30 @@ void *__memset16(uint16_t *s, uint16_t v, size_t count);
void *__memset32(uint32_t *s, uint32_t v, size_t count);
void *__memset64(uint64_t *s, uint64_t v, size_t count);
+#ifdef __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t count)
{
return __memset16(s, v, count * sizeof(v));
}
+#endif
+#ifdef __HAVE_ARCH_MEMSET32
static inline void *memset32(uint32_t *s, uint32_t v, size_t count)
{
return __memset32(s, v, count * sizeof(v));
}
+#endif
+#ifdef __HAVE_ARCH_MEMSET64
+#ifdef IN_BOOT_STRING_C
+void *memset64(uint64_t *s, uint64_t v, size_t count);
+#else
static inline void *memset64(uint64_t *s, uint64_t v, size_t count)
{
return __memset64(s, v, count * sizeof(v));
}
+#endif
+#endif
#if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY))
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index a674c7d25da5..d02a709717b8 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -16,7 +16,7 @@
/*
* General size of kernel stacks
*/
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) || defined(CONFIG_KMSAN)
#define THREAD_SIZE_ORDER 4
#else
#define THREAD_SIZE_ORDER 2
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 81ae8a98e7ec..9213be0529ee 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -18,6 +18,7 @@
#include <asm/extable.h>
#include <asm/facility.h>
#include <asm-generic/access_ok.h>
+#include <linux/instrumented.h>
void debug_user_asce(int exit);
@@ -78,13 +79,24 @@ union oac {
int __noreturn __put_user_bad(void);
-#define __put_user_asm(to, from, size) \
-({ \
+#ifdef CONFIG_KMSAN
+#define get_put_user_noinstr_attributes \
+ noinline __maybe_unused __no_sanitize_memory
+#else
+#define get_put_user_noinstr_attributes __always_inline
+#endif
+
+#define DEFINE_PUT_USER(type) \
+static get_put_user_noinstr_attributes int \
+__put_user_##type##_noinstr(unsigned type __user *to, \
+ unsigned type *from, \
+ unsigned long size) \
+{ \
union oac __oac_spec = { \
.oac1.as = PSW_BITS_AS_SECONDARY, \
.oac1.a = 1, \
}; \
- int __rc; \
+ int rc; \
\
asm volatile( \
" lr 0,%[spec]\n" \
@@ -93,12 +105,28 @@ int __noreturn __put_user_bad(void);
"2:\n" \
EX_TABLE_UA_STORE(0b, 2b, %[rc]) \
EX_TABLE_UA_STORE(1b, 2b, %[rc]) \
- : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \
+ : [rc] "=&d" (rc), [_to] "+Q" (*(to)) \
: [_size] "d" (size), [_from] "Q" (*(from)), \
[spec] "d" (__oac_spec.val) \
: "cc", "0"); \
- __rc; \
-})
+ return rc; \
+} \
+ \
+static __always_inline int \
+__put_user_##type(unsigned type __user *to, unsigned type *from, \
+ unsigned long size) \
+{ \
+ int rc; \
+ \
+ rc = __put_user_##type##_noinstr(to, from, size); \
+ instrument_put_user(*from, to, size); \
+ return rc; \
+}
+
+DEFINE_PUT_USER(char);
+DEFINE_PUT_USER(short);
+DEFINE_PUT_USER(int);
+DEFINE_PUT_USER(long);
static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
{
@@ -106,24 +134,24 @@ static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned lon
switch (size) {
case 1:
- rc = __put_user_asm((unsigned char __user *)ptr,
- (unsigned char *)x,
- size);
+ rc = __put_user_char((unsigned char __user *)ptr,
+ (unsigned char *)x,
+ size);
break;
case 2:
- rc = __put_user_asm((unsigned short __user *)ptr,
- (unsigned short *)x,
- size);
+ rc = __put_user_short((unsigned short __user *)ptr,
+ (unsigned short *)x,
+ size);
break;
case 4:
- rc = __put_user_asm((unsigned int __user *)ptr,
+ rc = __put_user_int((unsigned int __user *)ptr,
(unsigned int *)x,
size);
break;
case 8:
- rc = __put_user_asm((unsigned long __user *)ptr,
- (unsigned long *)x,
- size);
+ rc = __put_user_long((unsigned long __user *)ptr,
+ (unsigned long *)x,
+ size);
break;
default:
__put_user_bad();
@@ -134,13 +162,17 @@ static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned lon
int __noreturn __get_user_bad(void);
-#define __get_user_asm(to, from, size) \
-({ \
+#define DEFINE_GET_USER(type) \
+static get_put_user_noinstr_attributes int \
+__get_user_##type##_noinstr(unsigned type *to, \
+ unsigned type __user *from, \
+ unsigned long size) \
+{ \
union oac __oac_spec = { \
.oac2.as = PSW_BITS_AS_SECONDARY, \
.oac2.a = 1, \
}; \
- int __rc; \
+ int rc; \
\
asm volatile( \
" lr 0,%[spec]\n" \
@@ -149,13 +181,29 @@ int __noreturn __get_user_bad(void);
"2:\n" \
EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \
EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \
- : [rc] "=&d" (__rc), "=Q" (*(to)) \
+ : [rc] "=&d" (rc), "=Q" (*(to)) \
: [_size] "d" (size), [_from] "Q" (*(from)), \
[spec] "d" (__oac_spec.val), [_to] "a" (to), \
[_ksize] "K" (size) \
: "cc", "0"); \
- __rc; \
-})
+ return rc; \
+} \
+ \
+static __always_inline int \
+__get_user_##type(unsigned type *to, unsigned type __user *from, \
+ unsigned long size) \
+{ \
+ int rc; \
+ \
+ rc = __get_user_##type##_noinstr(to, from, size); \
+ instrument_get_user(*to); \
+ return rc; \
+}
+
+DEFINE_GET_USER(char);
+DEFINE_GET_USER(short);
+DEFINE_GET_USER(int);
+DEFINE_GET_USER(long);
static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
{
@@ -163,24 +211,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign
switch (size) {
case 1:
- rc = __get_user_asm((unsigned char *)x,
- (unsigned char __user *)ptr,
- size);
+ rc = __get_user_char((unsigned char *)x,
+ (unsigned char __user *)ptr,
+ size);
break;
case 2:
- rc = __get_user_asm((unsigned short *)x,
- (unsigned short __user *)ptr,
- size);
+ rc = __get_user_short((unsigned short *)x,
+ (unsigned short __user *)ptr,
+ size);
break;
case 4:
- rc = __get_user_asm((unsigned int *)x,
+ rc = __get_user_int((unsigned int *)x,
(unsigned int __user *)ptr,
size);
break;
case 8:
- rc = __get_user_asm((unsigned long *)x,
- (unsigned long __user *)ptr,
- size);
+ rc = __get_user_long((unsigned long *)x,
+ (unsigned long __user *)ptr,
+ size);
break;
default:
__get_user_bad();
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 9b65f04c83de..ac7b8c8e3133 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -282,12 +282,14 @@ int diag224(void *ptr)
int rc = -EOPNOTSUPP;
diag_stat_inc(DIAG_STAT_X224);
- asm volatile(
- " diag %1,%2,0x224\n"
- "0: lhi %0,0x0\n"
+ asm volatile("\n"
+ " diag %[type],%[addr],0x224\n"
+ "0: lhi %[rc],0\n"
"1:\n"
EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (addr) : "memory");
+ : [rc] "+d" (rc)
+ , "=m" (*(struct { char buf[PAGE_SIZE]; } *)ptr)
+ : [type] "d" (0), [addr] "d" (addr));
return rc;
}
EXPORT_SYMBOL(diag224);
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index ddf2ee47cb87..0bd6adc40a34 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -12,6 +12,7 @@
#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/types.h>
+#include <linux/kmsan-checks.h>
#include <linux/kprobes.h>
#include <linux/execmem.h>
#include <trace/syscall.h>
@@ -303,6 +304,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
+ kmsan_unpoison_memory(fregs, sizeof(*fregs));
regs = ftrace_get_regs(fregs);
p = get_kprobe((kprobe_opcode_t *)ip);
if (!regs || unlikely(!p) || kprobe_disabled(p))
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index a7c211a3a0c9..160b2acba8db 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include <linux/cpu.h>
#include <linux/entry-common.h>
+#include <linux/kmsan.h>
#include <asm/asm-extable.h>
#include <asm/vtime.h>
#include <asm/fpu.h>
@@ -262,6 +263,11 @@ static void monitor_event_exception(struct pt_regs *regs)
void kernel_stack_overflow(struct pt_regs *regs)
{
+ /*
+ * Normally regs are unpoisoned by the generic entry code, but
+ * kernel_stack_overflow() is a rare case that is called bypassing it.
+ */
+ kmsan_unpoison_entry_regs(regs);
bust_spinlocks(1);
printk("Kernel stack overflow.\n");
show_regs(regs);
diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c
index 0ece156fdd7c..cd44be2b6ce8 100644
--- a/arch/s390/kernel/unwind_bc.c
+++ b/arch/s390/kernel/unwind_bc.c
@@ -49,6 +49,8 @@ static inline bool is_final_pt_regs(struct unwind_state *state,
READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE;
}
+/* Avoid KMSAN false positives from touching uninitialized frames. */
+__no_kmsan_checks
bool unwind_next_frame(struct unwind_state *state)
{
struct stack_info *info = &state->stack_info;
@@ -118,6 +120,8 @@ out_stop:
}
EXPORT_SYMBOL_GPL(unwind_next_frame);
+/* Avoid KMSAN false positives from touching uninitialized frames. */
+__no_kmsan_checks
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long first_frame)
{
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 34d558164f0d..ded0eff58a19 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -169,7 +169,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
__set_huge_pte_at(mm, addr, ptep, pte);
}
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
return __rste_to_pte(pte_val(*ptep));
}
@@ -177,7 +177,7 @@ pte_t huge_ptep_get(pte_t *ptep)
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- pte_t pte = huge_ptep_get(ptep);
+ pte_t pte = huge_ptep_get(mm, addr, ptep);
pmd_t *pmdp = (pmd_t *) ptep;
pud_t *pudp = (pud_t *) ptep;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 00b247d924a9..53d7cb5bbffe 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -490,7 +490,7 @@ void flush_dcache_folio(struct folio *folio)
}
set_dcache_dirty(folio, this_cpu);
} else {
- /* We could delay the flush for the !page_mapping
+ /* We could delay the flush for the !folio_mapping
* case too. But that case is for exec env/arg
* pages and those are %99 certainly going to get
* faulted into the tlb (and thus flushed) anyways.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 28002cc7a37d..d8dbeac8b206 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -988,8 +988,6 @@ static void __meminit free_pagetable(struct page *page, int order)
/* bootmem page has reserved flag */
if (PageReserved(page)) {
- __ClearPageReserved(page);
-
magic = page->index;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
@@ -1362,18 +1360,6 @@ void __init mem_init(void)
preallocate_vmalloc_pages();
}
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
-{
- /*
- * More CPUs always led to greater speedups on tested systems, up to
- * all the nodes' CPUs. Use all since the system is otherwise idle
- * now.
- */
- return max_t(int, cpumask_weight(node_cpumask), 1);
-}
-#endif
-
int kernel_set_to_readonly;
void mark_rodata_ro(void)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 443a97e515c0..44f7b2ea6a07 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1119,8 +1119,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
- * otherwise pmd_present/pmd_huge will return true
- * even on a non present pmd.
+ * otherwise pmd_present() will return true even on a non
+ * present pmd.
*/
if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE;
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 9a7e5e57ee9a..1647a7cc3fbf 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -410,9 +410,9 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
typedef pte_t *pte_addr_t;
-void update_mmu_tlb(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep);
-#define __HAVE_ARCH_UPDATE_MMU_TLB
+void update_mmu_tlb_range(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int nr);
+#define update_mmu_tlb_range update_mmu_tlb_range
#endif /* !defined (__ASSEMBLY__) */
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index d8b60d6e50a8..0a1a815dc796 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -163,10 +163,10 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
}
-void update_mmu_tlb(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+void update_mmu_tlb_range(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int nr)
{
- local_flush_tlb_page(vma, address);
+ local_flush_tlb_range(vma, address, address + PAGE_SIZE * nr);
}
#ifdef CONFIG_DEBUG_TLB_SANITY