diff options
Diffstat (limited to 'arch/arm64/mm')
-rw-r--r-- | arch/arm64/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/arm64/mm/fault.c | 41 | ||||
-rw-r--r-- | arch/arm64/mm/init.c | 33 | ||||
-rw-r--r-- | arch/arm64/mm/mmap.c | 15 | ||||
-rw-r--r-- | arch/arm64/mm/mmu.c | 4 | ||||
-rw-r--r-- | arch/arm64/mm/physaddr.c | 2 | ||||
-rw-r--r-- | arch/arm64/mm/proc.S | 18 | ||||
-rw-r--r-- | arch/arm64/mm/ptdump.c | 1 | ||||
-rw-r--r-- | arch/arm64/mm/trans_pgd.c | 324 |
9 files changed, 360 insertions, 79 deletions
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3de3b6..77222d92667a 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -6,6 +6,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o +obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3c40da479899..2e339f0bd958 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -564,7 +564,7 @@ retry: mmap_read_lock(mm); } else { /* - * The above down_read_trylock() might have succeeded in which + * The above mmap_read_trylock() might have succeeded in which * case, we'll have missed the might_sleep() from down_read(). */ might_sleep(); @@ -709,10 +709,11 @@ static int do_tag_check_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { /* - * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag - * check faults. Mask them out now so that userspace doesn't see them. + * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN + * for tag check faults. Set them to corresponding bits in the untagged + * address. */ - far &= (1UL << 60) - 1; + far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK); do_bad_area(far, esr, regs); return 0; } @@ -874,44 +875,12 @@ static void debug_exception_exit(struct pt_regs *regs) } NOKPROBE_SYMBOL(debug_exception_exit); -#ifdef CONFIG_ARM64_ERRATUM_1463225 -DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - -static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) -{ - if (user_mode(regs)) - return 0; - - if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) - return 0; - - /* - * We've taken a dummy step exception from the kernel to ensure - * that interrupts are re-enabled on the syscall path. Return back - * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions - * masked so that we can safely restore the mdscr and get on with - * handling the syscall. - */ - regs->pstate |= PSR_D_BIT; - return 1; -} -#else -static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) -{ - return 0; -} -#endif /* CONFIG_ARM64_ERRATUM_1463225 */ -NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler); - void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); unsigned long pc = instruction_pointer(regs); - if (cortex_a76_erratum_1463225_debug_handler(regs)) - return; - debug_exception_enter(regs); if (user_mode(regs) && !is_ttbr0_addr(pc)) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 75addb36354a..709d98fea90c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -53,13 +53,13 @@ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr); /* - * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of - * memory as some devices, namely the Raspberry Pi 4, have peripherals with - * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32 - * bit addressable memory area. + * If the corresponding config options are enabled, we create both ZONE_DMA + * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory + * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4). + * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory, + * otherwise it is empty. */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -static phys_addr_t arm64_dma32_phys_limit __ro_after_init; #ifdef CONFIG_KEXEC_CORE /* @@ -84,7 +84,7 @@ static void __init reserve_crashkernel(void) if (crash_base == 0) { /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma32_phys_limit, + crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, crash_size, SZ_2M); if (crash_base == 0) { pr_warn("cannot allocate crashkernel (size:0x%llx)\n", @@ -196,6 +196,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; unsigned int __maybe_unused acpi_zone_dma_bits; unsigned int __maybe_unused dt_zone_dma_bits; + phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); #ifdef CONFIG_ZONE_DMA acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); @@ -205,8 +206,12 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit); + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); + if (!arm64_dma_phys_limit) + arm64_dma_phys_limit = dma32_phys_limit; #endif + if (!arm64_dma_phys_limit) + arm64_dma_phys_limit = PHYS_MASK + 1; max_zone_pfns[ZONE_NORMAL] = max; free_area_init(max_zone_pfns); @@ -394,16 +399,9 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - arm64_dma32_phys_limit = max_zone_phys(32); - else - arm64_dma32_phys_limit = PHYS_MASK + 1; - reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; - - dma_contiguous_reserve(arm64_dma32_phys_limit); } void __init bootmem_init(void) @@ -439,6 +437,11 @@ void __init bootmem_init(void) zone_sizes_init(min, max); /* + * Reserve the CMA area after arm64_dma_phys_limit was initialised. + */ + dma_contiguous_reserve(arm64_dma_phys_limit); + + /* * request_standard_resources() depends on crashkernel's memory being * reserved, so do it here. */ @@ -455,7 +458,7 @@ void __init bootmem_init(void) void __init mem_init(void) { if (swiotlb_force == SWIOTLB_FORCE || - max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit)) + max_pfn > PFN_DOWN(arm64_dma_phys_limit)) swiotlb_init(1); else swiotlb_force = SWIOTLB_NO_FORCE; diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 07937b49cb88..a38f54cd638c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -5,20 +5,11 @@ * Copyright (C) 2012 ARM Ltd. */ -#include <linux/elf.h> -#include <linux/fs.h> -#include <linux/memblock.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/export.h> -#include <linux/shm.h> -#include <linux/sched/signal.h> -#include <linux/sched/mm.h> #include <linux/io.h> -#include <linux/personality.h> -#include <linux/random.h> +#include <linux/memblock.h> +#include <linux/types.h> -#include <asm/cputype.h> +#include <asm/page.h> /* * You really shouldn't be using read() or write() on /dev/mem. This might go diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ae0c3d023824..25af183e4bed 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -628,7 +628,7 @@ static bool arm64_early_this_cpu_has_bti(void) if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) return false; - pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1); + pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1); return cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_BT_SHIFT); } @@ -1094,6 +1094,7 @@ static void free_empty_tables(unsigned long addr, unsigned long end, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { + WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); return vmemmap_populate_basepages(start, end, node, altmap); } #else /* !ARM64_SWAPPER_USES_SECTION_MAPS */ @@ -1107,6 +1108,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, pud_t *pudp; pmd_t *pmdp; + WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); do { next = pmd_addr_end(addr, end); diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c index 67a9ba9eaa96..cde44c13dda1 100644 --- a/arch/arm64/mm/physaddr.c +++ b/arch/arm64/mm/physaddr.c @@ -9,7 +9,7 @@ phys_addr_t __virt_to_phys(unsigned long x) { - WARN(!__is_lm_address(x), + WARN(!__is_lm_address(__tag_reset(x)), "virt_to_phys used for non-linear address: %pK (%pS)\n", (void *)x, (void *)x); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 37a54b57178a..c967bfd30d2b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -46,7 +46,7 @@ #endif #ifdef CONFIG_KASAN_HW_TAGS -#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 +#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1 #else #define TCR_KASAN_HW_FLAGS 0 #endif @@ -291,17 +291,7 @@ skip_pgd: /* We're done: fire up the MMU again */ mrs x17, sctlr_el1 orr x17, x17, #SCTLR_ELx_M - msr sctlr_el1, x17 - isb - - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb + set_sctlr_el1 x17 /* Set the flag to zero to indicate that we're all done */ str wzr, [flag_ptr] @@ -464,8 +454,8 @@ SYM_FUNC_START(__cpu_setup) #endif msr mair_el1, x5 /* - * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for - * both user and kernel. + * Set/prepare TCR and TTBR. TCR_EL1.T1SZ gets further + * adjusted if the kernel is compiled with 52bit VA support. */ mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 04137a8f3d2d..0e050d76b83a 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -324,6 +324,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) st = (struct pg_state){ .seq = s, .marker = info->markers, + .level = -1, .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]){ diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c new file mode 100644 index 000000000000..527f0a39c3da --- /dev/null +++ b/arch/arm64/mm/trans_pgd.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Transitional page tables for kexec and hibernate + * + * This file derived from: arch/arm64/kernel/hibernate.c + * + * Copyright (c) 2020, Microsoft Corporation. + * Pavel Tatashin <pasha.tatashin@soleen.com> + * + */ + +/* + * Transitional tables are used during system transferring from one world to + * another: such as during hibernate restore, and kexec reboots. During these + * phases one cannot rely on page table not being overwritten. This is because + * hibernate and kexec can overwrite the current page tables during transition. + */ + +#include <asm/trans_pgd.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <linux/suspend.h> +#include <linux/bug.h> +#include <linux/mm.h> +#include <linux/mmzone.h> + +static void *trans_alloc(struct trans_pgd_info *info) +{ + return info->trans_alloc_page(info->trans_alloc_arg); +} + +static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) +{ + pte_t pte = READ_ONCE(*src_ptep); + + if (pte_valid(pte)) { + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. + */ + set_pte(dst_ptep, pte_mkwrite(pte)); + } else if (debug_pagealloc_enabled() && !pte_none(pte)) { + /* + * debug_pagealloc will removed the PTE_VALID bit if + * the page isn't in use by the resume kernel. It may have + * been in use by the original kernel, in which case we need + * to put it back in our copy to do the restore. + * + * Before marking this entry valid, check the pfn should + * be mapped. + */ + BUG_ON(!pfn_valid(pte_pfn(pte))); + + set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); + } +} + +static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, + pmd_t *src_pmdp, unsigned long start, unsigned long end) +{ + pte_t *src_ptep; + pte_t *dst_ptep; + unsigned long addr = start; + + dst_ptep = trans_alloc(info); + if (!dst_ptep) + return -ENOMEM; + pmd_populate_kernel(NULL, dst_pmdp, dst_ptep); + dst_ptep = pte_offset_kernel(dst_pmdp, start); + + src_ptep = pte_offset_kernel(src_pmdp, start); + do { + _copy_pte(dst_ptep, src_ptep, addr); + } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); + + return 0; +} + +static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, + pud_t *src_pudp, unsigned long start, unsigned long end) +{ + pmd_t *src_pmdp; + pmd_t *dst_pmdp; + unsigned long next; + unsigned long addr = start; + + if (pud_none(READ_ONCE(*dst_pudp))) { + dst_pmdp = trans_alloc(info); + if (!dst_pmdp) + return -ENOMEM; + pud_populate(NULL, dst_pudp, dst_pmdp); + } + dst_pmdp = pmd_offset(dst_pudp, start); + + src_pmdp = pmd_offset(src_pudp, start); + do { + pmd_t pmd = READ_ONCE(*src_pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + continue; + if (pmd_table(pmd)) { + if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) + return -ENOMEM; + } else { + set_pmd(dst_pmdp, + __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + } + } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); + + return 0; +} + +static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, + p4d_t *src_p4dp, unsigned long start, + unsigned long end) +{ + pud_t *dst_pudp; + pud_t *src_pudp; + unsigned long next; + unsigned long addr = start; + + if (p4d_none(READ_ONCE(*dst_p4dp))) { + dst_pudp = trans_alloc(info); + if (!dst_pudp) + return -ENOMEM; + p4d_populate(NULL, dst_p4dp, dst_pudp); + } + dst_pudp = pud_offset(dst_p4dp, start); + + src_pudp = pud_offset(src_p4dp, start); + do { + pud_t pud = READ_ONCE(*src_pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + continue; + if (pud_table(pud)) { + if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) + return -ENOMEM; + } else { + set_pud(dst_pudp, + __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + } + } while (dst_pudp++, src_pudp++, addr = next, addr != end); + + return 0; +} + +static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, + pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(info, dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + +static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp, + unsigned long start, unsigned long end) +{ + unsigned long next; + unsigned long addr = start; + pgd_t *src_pgdp = pgd_offset_k(start); + + dst_pgdp = pgd_offset_pgd(dst_pgdp, start); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(READ_ONCE(*src_pgdp))) + continue; + if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next)) + return -ENOMEM; + } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); + + return 0; +} + +/* + * Create trans_pgd and copy linear map. + * info: contains allocator and its argument + * dst_pgdp: new page table that is created, and to which map is copied. + * start: Start of the interval (inclusive). + * end: End of the interval (exclusive). + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp, + unsigned long start, unsigned long end) +{ + int rc; + pgd_t *trans_pgd = trans_alloc(info); + + if (!trans_pgd) { + pr_err("Failed to allocate memory for temporary page tables.\n"); + return -ENOMEM; + } + + rc = copy_page_tables(info, trans_pgd, start, end); + if (!rc) + *dst_pgdp = trans_pgd; + + return rc; +} + +/* + * Add map entry to trans_pgd for a base-size page at PTE level. + * info: contains allocator and its argument + * trans_pgd: page table in which new map is added. + * page: page to be mapped. + * dst_addr: new VA address for the page + * pgprot: protection for the page. + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, + void *page, unsigned long dst_addr, pgprot_t pgprot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_pgd(trans_pgd, dst_addr); + if (pgd_none(READ_ONCE(*pgdp))) { + p4dp = trans_alloc(info); + if (!pgdp) + return -ENOMEM; + pgd_populate(NULL, pgdp, p4dp); + } + + p4dp = p4d_offset(pgdp, dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = trans_alloc(info); + if (!pudp) + return -ENOMEM; + p4d_populate(NULL, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); + if (pud_none(READ_ONCE(*pudp))) { + pmdp = trans_alloc(info); + if (!pmdp) + return -ENOMEM; + pud_populate(NULL, pudp, pmdp); + } + + pmdp = pmd_offset(pudp, dst_addr); + if (pmd_none(READ_ONCE(*pmdp))) { + ptep = trans_alloc(info); + if (!ptep) + return -ENOMEM; + pmd_populate_kernel(NULL, pmdp, ptep); + } + + ptep = pte_offset_kernel(pmdp, dst_addr); + set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot)); + + return 0; +} + +/* + * The page we want to idmap may be outside the range covered by VA_BITS that + * can be built using the kernel's p?d_populate() helpers. As a one off, for a + * single page, we build these page tables bottom up and just assume that will + * need the maximum T0SZ. + * + * Returns 0 on success, and -ENOMEM on failure. + * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to + * maximum T0SZ for this page. + */ +int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, + unsigned long *t0sz, void *page) +{ + phys_addr_t dst_addr = virt_to_phys(page); + unsigned long pfn = __phys_to_pfn(dst_addr); + int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47; + int bits_mapped = PAGE_SHIFT - 4; + unsigned long level_mask, prev_level_entry, *levels[4]; + int this_level, index, level_lsb, level_msb; + + dst_addr &= PAGE_MASK; + prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC)); + + for (this_level = 3; this_level >= 0; this_level--) { + levels[this_level] = trans_alloc(info); + if (!levels[this_level]) + return -ENOMEM; + + level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level); + level_msb = min(level_lsb + bits_mapped, max_msb); + level_mask = GENMASK_ULL(level_msb, level_lsb); + + index = (dst_addr & level_mask) >> level_lsb; + *(levels[this_level] + index) = prev_level_entry; + + pfn = virt_to_pfn(levels[this_level]); + prev_level_entry = pte_val(pfn_pte(pfn, + __pgprot(PMD_TYPE_TABLE))); + + if (level_msb == max_msb) + break; + } + + *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn)); + *t0sz = TCR_T0SZ(max_msb + 1); + + return 0; +} |