Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--  arch/arm64/mm/fault.c      |  30
-rw-r--r--  arch/arm64/mm/fixmap.c     |  39
-rw-r--r--  arch/arm64/mm/init.c       |   2
-rw-r--r--  arch/arm64/mm/kasan_init.c | 159
-rw-r--r--  arch/arm64/mm/mmap.c       |   4
-rw-r--r--  arch/arm64/mm/mmu.c        | 257
-rw-r--r--  arch/arm64/mm/pgd.c        |  17
-rw-r--r--  arch/arm64/mm/proc.S       | 116
-rw-r--r--  arch/arm64/mm/ptdump.c     |  77
9 files changed, 448 insertions, 253 deletions
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a8284..60265ede48fe 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -257,16 +257,14 @@ static bool is_el1_data_abort(unsigned long esr) static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { - unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE; - if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; - if (fsc_type == ESR_ELx_FSC_PERM) + if (esr_fsc_is_permission_fault(esr)) return true; if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && + return esr_fsc_is_translation_fault(esr) && (regs->pstate & PSR_PAN_BIT); return false; @@ -279,8 +277,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long flags; u64 par, dfsc; - if (!is_el1_data_abort(esr) || - (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) + if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr)) return false; local_irq_save(flags); @@ -301,7 +298,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * treat the translation fault as spurious. */ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par); - return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT; + return !esr_fsc_is_translation_fault(dfsc); } static void die_kernel_fault(const char *msg, unsigned long addr, @@ -368,11 +365,6 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) return false; } -static bool is_translation_fault(unsigned long esr) -{ - return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; -} - static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { @@ -405,7 +397,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && + if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; @@ -782,18 +774,18 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 8" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 12" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, @@ -801,7 +793,7 @@ static const struct fault_info fault_info[] = 
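The fault.c hunks replace the old exact-match test on ESR_ELx_FSC_TYPE with the esr_fsc_is_*() helpers so that the level -1 encodings introduced for LPA2 (52-bit VAs at the 4k/16k granules) are classified correctly; this is also why the local is_translation_fault() helper could be dropped. A minimal, self-contained sketch of that kind of range check, with FSC values taken from the Arm ARM; the kernel's real helpers live in asm/esr.h and may differ in detail:

/*
 * Sketch: classify an ESR fault status code (FSC), including the
 * level -1 translation fault encoding (0b101011) defined for LPA2.
 * Illustrative constants, not the kernel's asm/esr.h definitions.
 */
#include <stdbool.h>

#define FSC_MASK	0x3fUL
#define FSC_TRANS_L0	0x04UL	/* translation fault, level 0  */
#define FSC_TRANS_L3	0x07UL	/* translation fault, level 3  */
#define FSC_TRANS_LM1	0x2bUL	/* translation fault, level -1 */
#define FSC_PERM_L0	0x0cUL	/* permission fault, level 0   */
#define FSC_PERM_L3	0x0fUL	/* permission fault, level 3   */

static bool fsc_is_translation_fault(unsigned long esr)
{
	unsigned long fsc = esr & FSC_MASK;

	return (fsc >= FSC_TRANS_L0 && fsc <= FSC_TRANS_L3) ||
	       fsc == FSC_TRANS_LM1;
}

static bool fsc_is_permission_fault(unsigned long esr)
{
	unsigned long fsc = esr & FSC_MASK;

	return fsc >= FSC_PERM_L0 && fsc <= FSC_PERM_L3;
}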
{ { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 27" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented @@ -815,9 +807,9 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, + { do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 44" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 45" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 46" }, diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bd..d22506e9c7fd 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -16,6 +16,9 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> +/* ensure that the fixmap region does not grow down into the PCI I/O region */ +static_assert(FIXADDR_TOT_START > PCI_IO_END); + #define NR_BM_PTE_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) #define NR_BM_PMD_TABLES \ @@ -101,7 +104,7 @@ void __init early_fixmap_init(void) unsigned long end = FIXADDR_TOP; pgd_t *pgdp = pgd_offset_k(addr); - p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t *p4dp = p4d_offset_kimg(pgdp, addr); early_fixmap_init_pud(p4dp, addr, end); } @@ -167,37 +170,3 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } - -/* - * Copy the fixmap region into a new pgdir. - */ -void __init fixmap_copy(pgd_t *pgdir) -{ - if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdir, FIXADDR_TOT_START)))) { - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't - * live in the carveout for the swapper_pg_dir. We can simply - * re-use the existing dir for the fixmap. - */ - set_pgd(pgd_offset_pgd(pgdir, FIXADDR_TOT_START), - READ_ONCE(*pgd_offset_k(FIXADDR_TOT_START))); - } else if (CONFIG_PGTABLE_LEVELS > 3) { - pgd_t *bm_pgdp; - p4d_t *bm_p4dp; - pud_t *bm_pudp; - /* - * The fixmap shares its top level pgd entry with the kernel - * mapping. This can really only occur when we are running - * with 16k/4 levels, so we can simply reuse the pud level - * entry instead. 
- */ - BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - bm_pgdp = pgd_offset_pgd(pgdir, FIXADDR_TOT_START); - bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_TOT_START); - bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_TOT_START); - pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd)); - pud_clear_fixmap(); - } else { - BUG(); - } -} diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 74c1db8ce271..0f427b50fdc3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -238,7 +238,7 @@ void __init arm64_memblock_init(void) * physical address of PAGE_OFFSET, we have to *subtract* from it. */ if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52)) - memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52); + memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52); /* * Apply the memory limit if it was set. Since the kernel may be loaded diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946..fbddbf9faf19 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -23,7 +23,7 @@ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); +static pgd_t tmp_pg_dir[PTRS_PER_PTE] __initdata __aligned(PAGE_SIZE); /* * The p*d_populate functions call virt_to_phys implicitly so they can't be used @@ -99,6 +99,19 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); } +static p4d_t *__init kasan_p4d_offset(pgd_t *pgdp, unsigned long addr, int node, + bool early) +{ + if (pgd_none(READ_ONCE(*pgdp))) { + phys_addr_t p4d_phys = early ? + __pa_symbol(kasan_early_shadow_p4d) + : kasan_alloc_zeroed_page(node); + __pgd_populate(pgdp, p4d_phys, PGD_TYPE_TABLE); + } + + return early ? p4d_offset_kimg(pgdp, addr) : p4d_offset(pgdp, addr); +} + static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, unsigned long end, int node, bool early) { @@ -144,12 +157,12 @@ static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t *p4dp = kasan_p4d_offset(pgdp, addr, node, early); do { next = p4d_addr_end(addr, end); kasan_pud_populate(p4dp, addr, next, node, early); - } while (p4dp++, addr = next, addr != end); + } while (p4dp++, addr = next, addr != end && p4d_none(READ_ONCE(*p4dp))); } static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, @@ -165,19 +178,48 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, } while (pgdp++, addr = next, addr != end); } +#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS > 4 +#define SHADOW_ALIGN P4D_SIZE +#else +#define SHADOW_ALIGN PUD_SIZE +#endif + +/* + * Return whether 'addr' is aligned to the size covered by a root level + * descriptor. + */ +static bool __init root_level_aligned(u64 addr) +{ + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3); + + return (addr % (PAGE_SIZE << shift)) == 0; +} + /* The early shadow maps everything to a single page of zeroes */ asmlinkage void __init kasan_early_init(void) { BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); - /* - * We cannot check the actual value of KASAN_SHADOW_START during build, - * as it depends on vabits_actual. As a best-effort approach, check - * potential values calculated based on VA_BITS and VA_BITS_MIN. 
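The init.c change replaces the hard-coded 48 with vabits_actual: when a 52-bit VA kernel falls back to a smaller VA size at runtime (48 bits with 4k pages, 47 bits with 16k pages), the linear map correction has to be computed from the actual value, otherwise the 16k fallback case would apply a correction that is 2^47 bytes too small. A worked sketch of the arithmetic, assuming _PAGE_OFFSET(va) expands to -(1UL << va) as in asm/memory.h:

/*
 * Sketch: adjustment applied to memstart_addr when a 52-bit VA kernel
 * runs with a smaller VA size. Assumes _PAGE_OFFSET(va) == -(1UL << va).
 */
#include <stdio.h>

#define _PAGE_OFFSET(va)	(-(1UL << (va)))

int main(void)
{
	for (unsigned int vabits_actual = 47; vabits_actual <= 48; vabits_actual++) {
		unsigned long delta = _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

		/* delta == 2^52 - 2^vabits_actual: how far the linear map moves */
		printf("vabits_actual=%u: subtract %#lx from memstart_addr\n",
		       vabits_actual, delta);
	}
	return 0;
}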
- */ - BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), PGDIR_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), PGDIR_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), SHADOW_ALIGN)); + BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), SHADOW_ALIGN)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, SHADOW_ALIGN)); + + if (!root_level_aligned(KASAN_SHADOW_START)) { + /* + * The start address is misaligned, and so the next level table + * will be shared with the linear region. This can happen with + * 4 or 5 level paging, so install a generic pte_t[] as the + * next level. This prevents the kasan_pgd_populate call below + * from inserting an entry that refers to the shared KASAN zero + * shadow pud_t[]/p4d_t[], which could end up getting corrupted + * when the linear region is mapped. + */ + static pte_t tbl[PTRS_PER_PTE] __page_aligned_bss; + pgd_t *pgdp = pgd_offset_k(KASAN_SHADOW_START); + + set_pgd(pgdp, __pgd(__pa_symbol(tbl) | PGD_TYPE_TABLE)); + } + kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, true); } @@ -190,34 +232,74 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end, } /* - * Copy the current shadow region into a new pgdir. + * Return the descriptor index of 'addr' in the root level table */ -void __init kasan_copy_shadow(pgd_t *pgdir) +static int __init root_level_idx(u64 addr) { - pgd_t *pgdp, *pgdp_new, *pgdp_end; + /* + * On 64k pages, the TTBR1 range root tables are extended for 52-bit + * virtual addressing, and TTBR1 will simply point to the pgd_t entry + * that covers the start of the 48-bit addressable VA space if LVA is + * not implemented. This means we need to index the table as usual, + * instead of masking off bits based on vabits_actual. + */ + u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS + : vabits_actual; + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3); - pgdp = pgd_offset_k(KASAN_SHADOW_START); - pgdp_end = pgd_offset_k(KASAN_SHADOW_END); - pgdp_new = pgd_offset_pgd(pgdir, KASAN_SHADOW_START); - do { - set_pgd(pgdp_new, READ_ONCE(*pgdp)); - } while (pgdp++, pgdp_new++, pgdp != pgdp_end); + return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT); +} + +/* + * Clone a next level table from swapper_pg_dir into tmp_pg_dir + */ +static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud) +{ + int idx = root_level_idx(addr); + pgd_t pgd = READ_ONCE(swapper_pg_dir[idx]); + pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd)); + + memcpy(pud, pudp, PAGE_SIZE); + tmp_pg_dir[idx] = __pgd(__phys_to_pgd_val(__pa_symbol(pud)) | + PUD_TYPE_TABLE); } -static void __init clear_pgds(unsigned long start, - unsigned long end) +/* + * Return the descriptor index of 'addr' in the next level table + */ +static int __init next_level_idx(u64 addr) { - /* - * Remove references to kasan page tables from - * swapper_pg_dir. pgd_clear() can't be used - * here because it's nop on 2,3-level pagetable setups - */ - for (; start < end; start += PGDIR_SIZE) - set_pgd(pgd_offset_k(start), __pgd(0)); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3); + + return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE; +} + +/* + * Dereference the table descriptor at 'pgd_idx' and clear the entries from + * 'start' to 'end' (exclusive) from the table. 
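root_level_aligned() and the SHADOW_ALIGN choice both derive from how much address space a single descriptor in the root table covers, which now depends on the runtime number of paging levels. A worked sketch of that computation for 4k pages, assuming the ARM64_HW_PGTABLE_LEVELS() formula from asm/pgtable-hwdef.h:

/*
 * Sketch: bytes covered by one root-level descriptor, 4k pages.
 * ARM64_HW_PGTABLE_LEVELS(va) = ((va - 4) / (PAGE_SHIFT - 3)), i.e.
 * each level resolves PAGE_SHIFT - 3 = 9 bits of VA.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define HW_PGTABLE_LEVELS(va)	(((va) - 4) / (PAGE_SHIFT - 3))

int main(void)
{
	for (unsigned int vabits = 48; vabits <= 52; vabits += 4) {
		unsigned int levels = HW_PGTABLE_LEVELS(vabits);
		unsigned int shift = (levels - 1) * (PAGE_SHIFT - 3);

		/* 48-bit VA: 4 levels, each root entry covers 2^39 bytes */
		/* 52-bit VA: 5 levels, each root entry covers 2^48 bytes */
		printf("va=%u: %u levels, root descriptor covers 2^%u bytes\n",
		       vabits, levels, PAGE_SHIFT + shift);
	}
	return 0;
}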
+ */ +static void __init clear_next_level(int pgd_idx, int start, int end) +{ + pgd_t pgd = READ_ONCE(swapper_pg_dir[pgd_idx]); + pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd)); + + memset(&pudp[start], 0, (end - start) * sizeof(pud_t)); +} + +static void __init clear_shadow(u64 start, u64 end) +{ + int l = root_level_idx(start), m = root_level_idx(end); + + if (!root_level_aligned(start)) + clear_next_level(l++, next_level_idx(start), PTRS_PER_PTE); + if (!root_level_aligned(end)) + clear_next_level(m, 0, next_level_idx(end)); + memset(&swapper_pg_dir[l], 0, (m - l) * sizeof(pgd_t)); } static void __init kasan_init_shadow(void) { + static pud_t pud[2][PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); u64 kimg_shadow_start, kimg_shadow_end; u64 mod_shadow_start; u64 vmalloc_shadow_end; @@ -239,10 +321,23 @@ static void __init kasan_init_shadow(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); + + /* + * If the start or end address of the shadow region is not aligned to + * the root level size, we have to allocate a temporary next-level table + * in each case, clone the next level of descriptors, and install the + * table into tmp_pg_dir. Note that with 5 levels of paging, the next + * level will in fact be p4d_t, but that makes no difference in this + * case. + */ + if (!root_level_aligned(KASAN_SHADOW_START)) + clone_next_level(KASAN_SHADOW_START, tmp_pg_dir, pud[0]); + if (!root_level_aligned(KASAN_SHADOW_END)) + clone_next_level(KASAN_SHADOW_END, tmp_pg_dir, pud[1]); dsb(ishst); - cpu_replace_ttbr1(lm_alias(tmp_pg_dir), idmap_pg_dir); + cpu_replace_ttbr1(lm_alias(tmp_pg_dir)); - clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + clear_shadow(KASAN_SHADOW_START, KASAN_SHADOW_END); kasan_map_populate(kimg_shadow_start, kimg_shadow_end, early_pfn_to_nid(virt_to_pfn(lm_alias(KERNEL_START)))); @@ -276,7 +371,7 @@ static void __init kasan_init_shadow(void) PAGE_KERNEL_RO)); memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); } static void __init kasan_init_depth(void) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 645fe60d000f..642bdf908b22 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -73,6 +73,10 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } + if (lpa2_is_enabled()) + for (int i = 0; i < ARRAY_SIZE(protection_map); i++) + pgprot_val(protection_map[i]) &= ~PTE_SHARED; + return 0; } arch_initcall(adjust_protection_map); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9..bf5b1c426ad0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -45,18 +45,13 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ -int idmap_t0sz __ro_after_init; - -#if VA_BITS > 48 -u64 vabits_actual __ro_after_init = VA_BITS_MIN; -EXPORT_SYMBOL(vabits_actual); -#endif - u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 }; +static bool rodata_is_rw __ro_after_init = true; + /* * The booting CPU updates the failed status @__early_cpu_boot_status, * with MMU turned off. 
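clear_shadow() follows the usual pattern for wiping a range whose endpoints need not be aligned to the root-level granule: trim a partial head and tail one level down, then bulk-clear the whole root entries in between. A self-contained sketch of that pattern on plain arrays; the sizes are illustrative, not the kernel's table geometry:

/*
 * Sketch: clear offsets [start, end) in a two-level structure where only
 * whole top-level slots may be zeroed directly; partial head/tail slots
 * are cleared one level down, mirroring clear_shadow()/clear_next_level().
 * Assumes the range covers at least one whole top-level slot, as the
 * KASAN shadow region does.
 */
#include <stdint.h>
#include <string.h>

#define ENTRY_SZ	4096UL			/* bytes per leaf entry      */
#define ENTRIES		512UL			/* entries per table         */
#define SPAN		(ENTRIES * ENTRY_SZ)	/* bytes per top-level slot  */

static uint64_t top[ENTRIES];			/* stands in for the root table  */
static uint64_t next_level[ENTRIES][ENTRIES];	/* stands in for the next tables */

static void clear_range(uint64_t start, uint64_t end)	/* start, end < ENTRIES * SPAN */
{
	uint64_t l = start / SPAN, m = end / SPAN;

	if (start % SPAN) {			/* unaligned head */
		uint64_t idx = (start % SPAN) / ENTRY_SZ;

		memset(&next_level[l][idx], 0, (ENTRIES - idx) * sizeof(uint64_t));
		l++;
	}
	if (end % SPAN) {			/* unaligned tail */
		uint64_t idx = (end % SPAN) / ENTRY_SZ;

		memset(&next_level[m][0], 0, idx * sizeof(uint64_t));
	}
	memset(&top[l], 0, (m - l) * sizeof(uint64_t));	/* whole slots in between */
}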
@@ -73,10 +68,21 @@ EXPORT_SYMBOL(empty_zero_page); static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); -void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) +void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) { pgd_t *fixmap_pgdp; + /* + * Don't bother with the fixmap if swapper_pg_dir is still mapped + * writable in the kernel mapping. + */ + if (rodata_is_rw) { + WRITE_ONCE(*pgdp, pgd); + dsb(ishst); + isb(); + return; + } + spin_lock(&swapper_pgdir_lock); fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); WRITE_ONCE(*fixmap_pgdp, pgd); @@ -307,15 +313,14 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, } while (addr = next, addr != end); } -static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, +static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pud_t *pudp; - p4d_t *p4dp = p4d_offset(pgdp, addr); p4d_t p4d = READ_ONCE(*p4dp); + pud_t *pudp; if (p4d_none(p4d)) { p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; @@ -363,6 +368,46 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, pud_clear_fixmap(); } +static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long next; + pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp; + + if (pgd_none(pgd)) { + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN; + phys_addr_t p4d_phys; + + if (flags & NO_EXEC_MAPPINGS) + pgdval |= PGD_TABLE_PXN; + BUG_ON(!pgtable_alloc); + p4d_phys = pgtable_alloc(P4D_SHIFT); + __pgd_populate(pgdp, p4d_phys, pgdval); + pgd = READ_ONCE(*pgdp); + } + BUG_ON(pgd_bad(pgd)); + + p4dp = p4d_set_fixmap_offset(pgdp, addr); + do { + p4d_t old_p4d = READ_ONCE(*p4dp); + + next = p4d_addr_end(addr, end); + + alloc_init_pud(p4dp, addr, next, phys, prot, + pgtable_alloc, flags); + + BUG_ON(p4d_val(old_p4d) != 0 && + p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); + + phys += next - addr; + } while (p4dp++, addr = next, addr != end); + + p4d_clear_fixmap(); +} + static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, @@ -385,7 +430,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, + alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, flags); phys += next - addr; } while (pgdp++, addr = next, addr != end); @@ -576,8 +621,12 @@ static void __init map_mem(pgd_t *pgdp) * entries at any level are being shared between the linear region and * the vmalloc region. Check whether this is true for the PGD level, in * which case it is guaranteed to be true for all other levels as well. + * (Unless we are running with support for LPA2, in which case the + * entire reduced VA space is covered by a single pgd_t which will have + * been populated without the PXNTable attribute by the time we get here.) */ - BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end)); + BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) && + pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1); early_kfence_pool = arm64_kfence_alloc_pool(); @@ -630,15 +679,16 @@ void mark_rodata_ro(void) * to cover NOTES and EXCEPTION_TABLE. 
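The rodata_is_rw flag lets set_swapper_pgd() skip the fixmap-based write path for as long as swapper_pg_dir is still writable through the ordinary kernel mapping; mark_rodata_ro() clears the flag just before the permissions change. A small sketch of the same idea, with a stand-in for the fixmap alias rather than any kernel API:

/*
 * Sketch: write a root table entry directly while the table is still
 * mapped writable, and fall back to a temporary writable alias once the
 * kernel mapping has been made read-only. write_via_alias() stands in
 * for the pgd fixmap slot used by set_swapper_pgd().
 */
#include <stdbool.h>
#include <stdint.h>

static bool table_is_rw = true;		/* cleared before rodata goes RO */

static void write_via_alias(uint64_t *slot, uint64_t val)
{
	/* map 'slot' at a scratch writable VA, store, unmap (elided) */
	*(volatile uint64_t *)slot = val;
}

static void update_root_entry(uint64_t *slot, uint64_t val)
{
	if (table_is_rw) {
		*(volatile uint64_t *)slot = val;	/* direct store */
		return;
	}
	write_via_alias(slot, val);
}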
*/ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; + WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); debug_checkwx(); } -static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma, - int flags, unsigned long vm_flags) +static void __init declare_vma(struct vm_struct *vma, + void *va_start, void *va_end, + unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -646,9 +696,6 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(pa_start)); BUG_ON(!PAGE_ALIGNED(size)); - __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, flags); - if (!(vm_flags & VM_NO_GUARD)) size += PAGE_SIZE; @@ -661,12 +708,12 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, vm_area_add_early(vma); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static pgprot_t kernel_exec_prot(void) { return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { int i; @@ -701,80 +748,36 @@ core_initcall(map_entry_trampoline); #endif /* - * Open coded check for BTI, only for use to determine configuration - * for early mappings for before the cpufeature code has run. + * Declare the VMA areas for the kernel */ -static bool arm64_early_this_cpu_has_bti(void) +static void __init declare_kernel_vmas(void) { - u64 pfr1; + static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; - if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) - return false; - - pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1); - return cpuid_feature_extract_unsigned_field(pfr1, - ID_AA64PFR1_EL1_BT_SHIFT); + declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); + declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); + declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); + declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); + declare_vma(&vmlinux_seg[4], _data, _end, 0); } -/* - * Create fine-grained mappings for the kernel. - */ -static void __init map_kernel(pgd_t *pgdp) -{ - static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext, - vmlinux_initdata, vmlinux_data; +void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset); - /* - * External debuggers may need to write directly to the text - * mapping to install SW breakpoints. Allow this (only) when - * explicitly requested with rodata=off. - */ - pgprot_t text_prot = kernel_exec_prot(); - - /* - * If we have a CPU that supports BTI and a kernel built for - * BTI then mark the kernel executable text as guarded pages - * now so we don't have to rewrite the page tables later. - */ - if (arm64_early_this_cpu_has_bti()) - text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); - - /* - * Only rodata will be remapped with different permissions later on, - * all other segments are allowed to use contiguous mappings. 
- */ - map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0, - VM_NO_GUARD); - map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, - &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); - map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot, - &vmlinux_inittext, 0, VM_NO_GUARD); - map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL, - &vmlinux_initdata, 0, VM_NO_GUARD); - map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0); - - fixmap_copy(pgdp); - kasan_copy_shadow(pgdp); -} +static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, + kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { u64 start = __pa_symbol(__idmap_text_start); - u64 size = __pa_symbol(__idmap_text_end) - start; - pgd_t *pgd = idmap_pg_dir; - u64 pgd_phys; - - /* check if we need an additional level of translation */ - if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) { - pgd_phys = early_pgtable_alloc(PAGE_SHIFT); - set_pgd(&idmap_pg_dir[start >> VA_BITS], - __pgd(pgd_phys | P4D_TYPE_TABLE)); - pgd = __va(pgd_phys); - } - __create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX, - early_pgtable_alloc, 0); + u64 end = __pa_symbol(__idmap_text_end); + u64 ptep = __pa_symbol(idmap_ptes); + + __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, + __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { extern u32 __idmap_kpti_flag; u64 pa = __pa_symbol(&__idmap_kpti_flag); @@ -782,32 +785,21 @@ static void __init create_idmap(void) * The KPTI G-to-nG conversion code needs a read-write mapping * of its synchronization flag in the ID map. 
*/ - __create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL, - early_pgtable_alloc, 0); + ptep = __pa_symbol(kpti_ptes); + __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, + __phys_to_virt(ptep) - ptep); } } void __init paging_init(void) { - pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); - extern pgd_t init_idmap_pg_dir[]; - - idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0)); - - map_kernel(pgdp); - map_mem(pgdp); - - pgd_clear_fixmap(); - - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir); - init_mm.pgd = swapper_pg_dir; - - memblock_phys_free(__pa_symbol(init_pg_dir), - __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); + map_mem(swapper_pg_dir); memblock_allow_resize(); create_idmap(); + declare_kernel_vmas(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -1073,10 +1065,10 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); - if (CONFIG_PGTABLE_LEVELS <= 3) + if (!pgtable_l4_enabled()) return; - if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) + if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK)) return; /* @@ -1099,8 +1091,8 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { - unsigned long next; p4d_t *p4dp, p4d; + unsigned long i, next, start = addr; do { next = p4d_addr_end(addr, end); @@ -1112,6 +1104,27 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, WARN_ON(!p4d_present(p4d)); free_empty_pud_table(p4dp, addr, next, floor, ceiling); } while (addr = next, addr < end); + + if (!pgtable_l5_enabled()) + return; + + if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) + return; + + /* + * Check whether we can free the p4d page if the rest of the + * entries are empty. Overlap with other regions have been + * handled by the floor/ceiling check. + */ + p4dp = p4d_offset(pgdp, 0UL); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (!p4d_none(READ_ONCE(p4dp[i]))) + return; + } + + pgd_clear(pgdp); + __flush_tlb_kernel_pgtable(start); + free_hotplug_pgtable_page(virt_to_page(p4dp)); } static void free_empty_tables(unsigned long addr, unsigned long end, @@ -1196,6 +1209,12 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } +#ifndef __PAGETABLE_P4D_FOLDED +void p4d_clear_huge(p4d_t *p4dp) +{ +} +#endif + int pud_clear_huge(pud_t *pudp) { if (!pud_sect(READ_ONCE(*pudp))) @@ -1486,3 +1505,35 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte { set_pte_at(vma->vm_mm, addr, ptep, pte); } + +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated. + */ +void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + unsigned long daif; + + /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ + phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); + + if (cnp) + ttbr1 |= TTBR_CNP_BIT; + + replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + + /* + * We really don't want to take *any* exceptions while TTBR1 is + * in the process of being replaced so mask everything. 
+ */ + daif = local_daif_save(); + replace_phys(ttbr1); + local_daif_restore(daif); + + cpu_uninstall_idmap(); +} diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771..0c501cabc238 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -17,11 +17,22 @@ static struct kmem_cache *pgd_cache __ro_after_init; +static bool pgdir_is_page_size(void) +{ + if (PGD_SIZE == PAGE_SIZE) + return true; + if (CONFIG_PGTABLE_LEVELS == 4) + return !pgtable_l4_enabled(); + if (CONFIG_PGTABLE_LEVELS == 5) + return !pgtable_l5_enabled(); + return false; +} + pgd_t *pgd_alloc(struct mm_struct *mm) { gfp_t gfp = GFP_PGTABLE_USER; - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) return (pgd_t *)__get_free_page(gfp); else return kmem_cache_alloc(pgd_cache, gfp); @@ -29,7 +40,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) free_page((unsigned long)pgd); else kmem_cache_free(pgd_cache, pgd); @@ -37,7 +48,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) void __init pgtable_cache_init(void) { - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) return; #ifdef CONFIG_ARM64_PA_BITS_52 diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f66c37a1610e..9d40f3ffd8d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -195,27 +195,36 @@ SYM_TYPED_FUNC_START(idmap_cpu_replace_ttbr1) ret SYM_FUNC_END(idmap_cpu_replace_ttbr1) +SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) .popsection #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS | PTE_WRITE) +#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | PTE_TYPE_PAGE | \ + PTE_AF | PTE_SHARED | PTE_UXN | PTE_WRITE) .pushsection ".idmap.text", "a" + .macro pte_to_phys, phys, pte + and \phys, \pte, #PTE_ADDR_LOW +#ifdef CONFIG_ARM64_PA_BITS_52 + and \pte, \pte, #PTE_ADDR_HIGH + orr \phys, \phys, \pte, lsl #PTE_ADDR_HIGH_SHIFT +#endif + .endm + .macro kpti_mk_tbl_ng, type, num_entries add end_\type\()p, cur_\type\()p, #\num_entries * 8 .Ldo_\type: - ldr \type, [cur_\type\()p] // Load the entry + ldr \type, [cur_\type\()p], #8 // Load the entry and advance tbz \type, #0, .Lnext_\type // Skip invalid and tbnz \type, #11, .Lnext_\type // non-global entries orr \type, \type, #PTE_NG // Same bit for blocks and pages - str \type, [cur_\type\()p] // Update the entry + str \type, [cur_\type\()p, #-8] // Update the entry .ifnc \type, pte tbnz \type, #1, .Lderef_\type .endif .Lnext_\type: - add cur_\type\()p, cur_\type\()p, #8 cmp cur_\type\()p, end_\type\()p b.ne .Ldo_\type .endm @@ -225,18 +234,18 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1) * fixmap slot associated with the current level. 
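The new pte_to_phys assembler macro reassembles a 52-bit output address from a descriptor in which the high PA bits no longer sit in their natural position. A C sketch of the same extraction for the LPA2 layout, where PA[51:50] live in descriptor bits [9:8] (the former shareability field); the masks below are written out as assumptions rather than copied from asm/pgtable-hwdef.h:

/*
 * Sketch: recover a 52-bit physical address from an LPA2 descriptor.
 * PA[49:12] are in the usual place; PA[51:50] are held in bits [9:8],
 * so they must be shifted left by 42 to land at bit 50.
 */
#include <stdint.h>

#define PTE_ADDR_LOW		(((1UL << 50) - 1) & ~((1UL << 12) - 1))	/* bits 49:12 */
#define PTE_ADDR_HIGH		(0x3UL << 8)					/* bits  9:8  */
#define PTE_ADDR_HIGH_SHIFT	42

static inline uint64_t pte_to_phys(uint64_t pte)
{
	return (pte & PTE_ADDR_LOW) |
	       ((pte & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
}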
*/ .macro kpti_map_pgtbl, type, level - str xzr, [temp_pte, #8 * (\level + 1)] // break before make + str xzr, [temp_pte, #8 * (\level + 2)] // break before make dsb nshst - add pte, temp_pte, #PAGE_SIZE * (\level + 1) + add pte, temp_pte, #PAGE_SIZE * (\level + 2) lsr pte, pte, #12 tlbi vaae1, pte dsb nsh isb phys_to_pte pte, cur_\type\()p - add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 1) + add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 2) orr pte, pte, pte_flags - str pte, [temp_pte, #8 * (\level + 1)] + str pte, [temp_pte, #8 * (\level + 2)] dsb nshst .endm @@ -269,6 +278,8 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) end_ptep .req x15 pte .req x16 valid .req x17 + cur_p4dp .req x19 + end_p4dp .req x20 mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 @@ -276,6 +287,12 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cbnz cpu, __idmap_kpti_secondary +#if CONFIG_PGTABLE_LEVELS > 4 + stp x29, x30, [sp, #-32]! + mov x29, sp + stp x19, x20, [sp, #16] +#endif + /* We're the boot CPU. Wait for the others to catch up */ sevl 1: wfe @@ -293,9 +310,32 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov_q pte_flags, KPTI_NG_PTE_FLAGS /* Everybody is enjoying the idmap, so we can rewrite swapper. */ + +#ifdef CONFIG_ARM64_LPA2 + /* + * If LPA2 support is configured, but 52-bit virtual addressing is not + * enabled at runtime, we will fall back to one level of paging less, + * and so we have to walk swapper_pg_dir as if we dereferenced its + * address from a PGD level entry, and terminate the PGD level loop + * right after. + */ + adrp pgd, swapper_pg_dir // walk &swapper_pg_dir at the next level + mov cur_pgdp, end_pgdp // must be equal to terminate the PGD loop +alternative_if_not ARM64_HAS_VA52 + b .Lderef_pgd // skip to the next level +alternative_else_nop_endif + /* + * LPA2 based 52-bit virtual addressing requires 52-bit physical + * addressing to be enabled as well. In this case, the shareability + * bits are repurposed as physical address bits, and should not be + * set in pte_flags. + */ + bic pte_flags, pte_flags, #PTE_SHARED +#endif + /* PGD */ adrp cur_pgdp, swapper_pg_dir - kpti_map_pgtbl pgd, 0 + kpti_map_pgtbl pgd, -1 kpti_mk_tbl_ng pgd, PTRS_PER_PGD /* Ensure all the updated entries are visible to secondary CPUs */ @@ -308,16 +348,33 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) /* Set the flag to zero to indicate that we're all done */ str wzr, [flag_ptr] +#if CONFIG_PGTABLE_LEVELS > 4 + ldp x19, x20, [sp, #16] + ldp x29, x30, [sp], #32 +#endif ret .Lderef_pgd: + /* P4D */ + .if CONFIG_PGTABLE_LEVELS > 4 + p4d .req x30 + pte_to_phys cur_p4dp, pgd + kpti_map_pgtbl p4d, 0 + kpti_mk_tbl_ng p4d, PTRS_PER_P4D + b .Lnext_pgd + .else /* CONFIG_PGTABLE_LEVELS <= 4 */ + p4d .req pgd + .set .Lnext_p4d, .Lnext_pgd + .endif + +.Lderef_p4d: /* PUD */ .if CONFIG_PGTABLE_LEVELS > 3 pud .req x10 - pte_to_phys cur_pudp, pgd + pte_to_phys cur_pudp, p4d kpti_map_pgtbl pud, 1 kpti_mk_tbl_ng pud, PTRS_PER_PUD - b .Lnext_pgd + b .Lnext_p4d .else /* CONFIG_PGTABLE_LEVELS <= 3 */ pud .req pgd .set .Lnext_pud, .Lnext_pgd @@ -361,6 +418,9 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) .unreq end_ptep .unreq pte .unreq valid + .unreq cur_p4dp + .unreq end_p4dp + .unreq p4d /* Secondary CPUs end up here */ __idmap_kpti_secondary: @@ -395,8 +455,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings) * * Initialise the processor for turning the MMU on. 
* - * Input: - * x0 - actual number of VA bits (ignored unless VA_BITS > 48) * Output: * Return in x0 the value of the SCTLR_EL1 register. */ @@ -420,20 +478,21 @@ SYM_FUNC_START(__cpu_setup) mair .req x17 tcr .req x16 mov_q mair, MAIR_EL1_SET - mov_q tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ - TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ + TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ + TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 #ifdef CONFIG_ARM64_VA_BITS_52 - sub x9, xzr, x0 - add x9, x9, #64 + mov x9, #64 - VA_BITS +alternative_if ARM64_HAS_VA52 tcr_set_t1sz tcr, x9 -#else - idmap_get_t0sz x9 +#ifdef CONFIG_ARM64_LPA2 + orr tcr, tcr, #TCR_DS +#endif +alternative_else_nop_endif #endif - tcr_set_t0sz tcr, x9 /* * Set the IPS bits in TCR_EL1. @@ -458,11 +517,26 @@ SYM_FUNC_START(__cpu_setup) ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection + /* + * The PROT_* macros describing the various memory types may resolve to + * C expressions if they include the PTE_MAYBE_* macros, and so they + * can only be used from C code. The PIE_E* constants below are also + * defined in terms of those macros, but will mask out those + * PTE_MAYBE_* constants, whether they are set or not. So #define them + * as 0x0 here so we can evaluate the PIE_E* constants in asm context. + */ + +#define PTE_MAYBE_NG 0 +#define PTE_MAYBE_SHARED 0 + mov_q x0, PIE_E0 msr REG_PIRE0_EL1, x0 mov_q x0, PIE_E1 msr REG_PIR_EL1, x0 +#undef PTE_MAYBE_NG +#undef PTE_MAYBE_SHARED + mov x0, TCR2_EL1x_PIE msr REG_TCR2_EL1, x0 diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e..5b87f8d623f7 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -26,34 +26,6 @@ #include <asm/ptdump.h> -enum address_markers_idx { - PAGE_OFFSET_NR = 0, - PAGE_END_NR, -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - KASAN_START_NR, -#endif -}; - -static struct addr_marker address_markers[] = { - { PAGE_OFFSET, "Linear Mapping start" }, - { 0 /* PAGE_END */, "Linear Mapping end" }, -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, - { KASAN_SHADOW_END, "Kasan shadow end" }, -#endif - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { VMALLOC_START, "vmalloc() area" }, - { VMALLOC_END, "vmalloc() end" }, - { FIXADDR_TOT_START, "Fixmap start" }, - { FIXADDR_TOP, "Fixmap end" }, - { PCI_IO_START, "PCI I/O start" }, - { PCI_IO_END, "PCI I/O end" }, - { VMEMMAP_START, "vmemmap start" }, - { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" }, - { -1, NULL }, -}; - #define pt_dump_seq_printf(m, fmt, args...) 
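__cpu_setup now programs T0SZ for the ID map and T1SZ for VA_BITS_MIN up front, and only widens T1SZ (and sets TCR_EL1.DS for LPA2) once the ARM64_HAS_VA52 capability is confirmed. A sketch of the TxSZ arithmetic; the field positions follow the architected TCR_EL1 layout (T0SZ at bit 0, T1SZ at bit 16, DS at bit 59) but are stated here as assumptions:

/*
 * Sketch: TCR_EL1.TxSZ holds (64 - VA bits) for each TTBR; DS enables
 * 52-bit translation at the 4k/16k granules (LPA2).
 */
#include <stdint.h>

#define TCR_T0SZ_SHIFT	0
#define TCR_T1SZ_SHIFT	16
#define TCR_DS		(1UL << 59)

static inline uint64_t tcr_va_fields(unsigned int t0_va_bits,
				     unsigned int t1_va_bits, int lpa2)
{
	uint64_t tcr = ((64UL - t0_va_bits) << TCR_T0SZ_SHIFT) |
		       ((64UL - t1_va_bits) << TCR_T1SZ_SHIFT);

	if (lpa2)
		tcr |= TCR_DS;	/* required for 52-bit VAs/PAs at 4k/16k */
	return tcr;
}

For example, tcr_va_fields(48, 52, 1) yields T0SZ=16 and T1SZ=12 with DS set, matching the 4k LPA2 case.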
\ ({ \ if (m) \ @@ -76,6 +48,7 @@ struct pg_state { struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; + const struct mm_struct *mm; unsigned long start_address; int level; u64 current_prot; @@ -172,12 +145,12 @@ static const struct prot_bits pte_bits[] = { struct pg_level { const struct prot_bits *bits; - const char *name; - size_t num; + char name[4]; + int num; u64 mask; }; -static struct pg_level pg_level[] = { +static struct pg_level pg_level[] __ro_after_init = { { /* pgd */ .name = "PGD", .bits = pte_bits, @@ -187,11 +160,11 @@ static struct pg_level pg_level[] = { .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pud */ - .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD", + .name = "PUD", .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pmd */ - .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD", + .name = "PMD", .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pte */ @@ -255,6 +228,11 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, static const char units[] = "KMGTPE"; u64 prot = 0; + /* check if the current level has been folded dynamically */ + if ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm))) + level = 0; + if (level >= 0) prot = val & pg_level[level].mask; @@ -316,6 +294,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) st = (struct pg_state){ .seq = s, .marker = info->markers, + .mm = info->mm, .level = -1, .ptdump = { .note_page = note_page, @@ -339,10 +318,8 @@ static void __init ptdump_initialize(void) pg_level[i].mask |= pg_level[i].bits[j].mask; } -static struct ptdump_info kernel_ptdump_info = { +static struct ptdump_info kernel_ptdump_info __ro_after_init = { .mm = &init_mm, - .markers = address_markers, - .base_addr = PAGE_OFFSET, }; void ptdump_check_wx(void) @@ -358,7 +335,7 @@ void ptdump_check_wx(void) .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]) { - {PAGE_OFFSET, ~0UL}, + {_PAGE_OFFSET(vabits_actual), ~0UL}, {0, 0} } } @@ -375,10 +352,32 @@ void ptdump_check_wx(void) static int __init ptdump_init(void) { - address_markers[PAGE_END_NR].start_address = PAGE_END; + u64 page_offset = _PAGE_OFFSET(vabits_actual); + u64 vmemmap_start = (u64)virt_to_page((void *)page_offset); + struct addr_marker m[] = { + { PAGE_OFFSET, "Linear Mapping start" }, + { PAGE_END, "Linear Mapping end" }, #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START; + { KASAN_SHADOW_START, "Kasan shadow start" }, + { KASAN_SHADOW_END, "Kasan shadow end" }, #endif + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, + { VMALLOC_START, "vmalloc() area" }, + { VMALLOC_END, "vmalloc() end" }, + { vmemmap_start, "vmemmap start" }, + { VMEMMAP_END, "vmemmap end" }, + { PCI_IO_START, "PCI I/O start" }, + { PCI_IO_END, "PCI I/O end" }, + { FIXADDR_TOT_START, "Fixmap start" }, + { FIXADDR_TOP, "Fixmap end" }, + { -1, NULL }, + }; + static struct addr_marker address_markers[ARRAY_SIZE(m)] __ro_after_init; + + kernel_ptdump_info.markers = memcpy(address_markers, m, sizeof(m)); + kernel_ptdump_info.base_addr = page_offset; + ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); return 0; |
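The note_page() change accounts for levels that are folded at runtime rather than at build time: when LPA2/LVA is not present, what the walker reports as a p4d (or pud) entry is really the root table, so the dump should label it PGD. A compact, self-contained restatement of that check, for illustration only:

/*
 * Sketch: map a page table walker level to the level that should be
 * reported when intermediate levels are folded at runtime (i.e. the
 * kernel was built for more paging levels than the hardware can use).
 */
static int effective_level(int level, int p4d_folded, int pud_folded)
{
	if ((level == 1 && p4d_folded) || (level == 2 && pud_folded))
		return 0;	/* really a root (PGD) entry */
	return level;
}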