Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile                |  10
-rw-r--r--  arch/x86/mm/amdtopology.c           |   2
-rw-r--r--  arch/x86/mm/extable.c               |   2
-rw-r--r--  arch/x86/mm/fault.c                 |   8
-rw-r--r--  arch/x86/mm/init.c                  |  23
-rw-r--r--  arch/x86/mm/init_32.c               |  12
-rw-r--r--  arch/x86/mm/init_64.c               |  19
-rw-r--r--  arch/x86/mm/mem_encrypt_amd.c       |   2
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c  | 569
-rw-r--r--  arch/x86/mm/mm_internal.h           |   4
-rw-r--r--  arch/x86/mm/numa.c                  |   5
-rw-r--r--  arch/x86/mm/numa_32.c               |  61
-rw-r--r--  arch/x86/mm/numa_64.c               |  13
-rw-r--r--  arch/x86/mm/numa_internal.h         |  10
-rw-r--r--  arch/x86/mm/pat/memtype.c           |  36
-rw-r--r--  arch/x86/mm/pat/set_memory.c        |   4
-rw-r--r--  arch/x86/mm/pgtable.c               | 158
-rw-r--r--  arch/x86/mm/pti.c                   |   4
-rw-r--r--  arch/x86/mm/tlb.c                   | 176
19 files changed, 230 insertions, 888 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 32035d5be5a0..5b9908f13dcf 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -3,12 +3,10 @@ KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n KCOV_INSTRUMENT_mem_encrypt_amd.o := n -KCOV_INSTRUMENT_mem_encrypt_identity.o := n KCOV_INSTRUMENT_pgprot.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_amd.o := n -KASAN_SANITIZE_mem_encrypt_identity.o := n KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions @@ -16,12 +14,10 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. KMSAN_SANITIZE_cpu_entry_area.o := n -KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_amd.o = -pg -CFLAGS_REMOVE_mem_encrypt_identity.o = -pg CFLAGS_REMOVE_pgprot.o = -pg endif @@ -32,9 +28,6 @@ obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_mem_encrypt_identity.o := -fno-stack-protector - -CFLAGS_fault.o := -I $(src)/../include/asm/trace obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o @@ -52,7 +45,7 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o @@ -63,5 +56,4 @@ obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 628833afee37..f980b0eb0105 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -25,7 +25,7 @@ #include <asm/numa.h> #include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> static unsigned char __initdata nodeids[8]; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 51986e8a9d35..bf8dab18be97 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -111,7 +111,7 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup, /* * Handler for when we fail to restore a task's FPU state. We should never get - * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * here because the FPU state of a task using the FPU (struct fpu::fpstate) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 296d294142c8..998bd807fc7b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -13,7 +13,6 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ -#include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... 
*/ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ @@ -38,7 +37,7 @@ #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS -#include <asm/trace/exceptions.h> +#include <trace/events/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not @@ -1455,9 +1454,6 @@ static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - if (!trace_pagefault_enabled()) - return; - if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else @@ -1496,8 +1492,6 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); - prefetchw(¤t->mm->mmap_lock); - /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bfa444a7dbb0..7456df985d96 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,6 +28,7 @@ #include <asm/text-patching.h> #include <asm/memtype.h> #include <asm/paravirt.h> +#include <asm/mmu_context.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -173,11 +174,7 @@ __ref void *alloc_low_pages(unsigned int num) * randomization is enabled. */ -#ifndef CONFIG_X86_5LEVEL -#define INIT_PGD_PAGE_TABLES 3 -#else #define INIT_PGD_PAGE_TABLES 4 -#endif #ifndef CONFIG_RANDOMIZE_MEMORY #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) @@ -824,31 +821,33 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = mm_alloc(); - BUG_ON(!poking_mm); + text_poke_mm = mm_alloc(); + BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_enter_mmap(poking_mm); + paravirt_enter_mmap(text_poke_mm); + + set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ - poking_addr = TASK_UNMAPPED_BASE; + text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); - if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) - poking_addr += PAGE_SIZE; + if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c..607d6a2e66e2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,6 +30,7 @@ #include <linux/initrd.h> #include <linux/cpumask.h> #include <linux/gfp.h> +#include <linux/execmem.h> #include <asm/asm.h> #include <asm/bios_ebda.h> @@ -565,7 +566,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. 
Support for for CONFIG_HIGHMEM64G was removed!\n" + "Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: @@ -612,7 +613,6 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -633,12 +633,6 @@ void __init initmem_init(void) printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); - setup_bootmem_allocator(); -} -#endif /* !CONFIG_NUMA */ - -void __init setup_bootmem_allocator(void) -{ printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); @@ -755,6 +749,8 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b..66330fe4e18c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,6 +34,7 @@ #include <linux/gfp.h> #include <linux/kcore.h> #include <linux/bootmem_info.h> +#include <linux/execmem.h> #include <asm/processor.h> #include <asm/bios_ebda.h> @@ -805,12 +806,17 @@ kernel_physical_mapping_change(unsigned long paddr_start, } #ifndef CONFIG_NUMA -void __init initmem_init(void) +static inline void x86_numa_init(void) { memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } #endif +void __init initmem_init(void) +{ + x86_numa_init(); +} + void __init paging_init(void) { sparse_init(); @@ -827,7 +833,6 @@ void __init paging_init(void) zone_sizes_init(); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP #define PAGE_UNUSED 0xFD /* @@ -926,7 +931,6 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long if (!IS_ALIGNED(end, PMD_SIZE)) unused_pmd_start = end; } -#endif /* * Memory hotplug specific functions @@ -1146,16 +1150,13 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); pages++; - } -#ifdef CONFIG_SPARSEMEM_VMEMMAP - else if (vmemmap_pmd_is_unused(addr, next)) { + } else if (vmemmap_pmd_is_unused(addr, next)) { free_hugepage_table(pmd_page(*pmd), altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } -#endif continue; } @@ -1391,6 +1392,8 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; /* @@ -1492,7 +1495,6 @@ unsigned long memory_block_size_bytes(void) return memory_block_size_probed; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ @@ -1639,4 +1641,3 @@ void __meminit vmemmap_populate_print_last(void) node_start = 0; } } -#endif diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 7490ff6d83b1..faf3a13fb6ba 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -40,7 +40,9 @@ * section is later cleared. 
*/ u64 sme_me_mask __section(".data") = 0; +SYM_PIC_ALIAS(sme_me_mask); u64 sev_status __section(".data") = 0; +SYM_PIC_ALIAS(sev_status); u64 sev_check_data __section(".data") = 0; EXPORT_SYMBOL(sme_me_mask); diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c deleted file mode 100644 index 5eecdd92da10..000000000000 --- a/arch/x86/mm/mem_encrypt_identity.c +++ /dev/null @@ -1,569 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * AMD Memory Encryption Support - * - * Copyright (C) 2016 Advanced Micro Devices, Inc. - * - * Author: Tom Lendacky <thomas.lendacky@amd.com> - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* - * Special hack: we have to be careful, because no indirections are - * allowed here, and paravirt_ops is a kind of one. As it will only run in - * baremetal anyway, we just keep it from happening. (This list needs to - * be extended when new paravirt and debugging variants are added.) - */ -#undef CONFIG_PARAVIRT -#undef CONFIG_PARAVIRT_XXL -#undef CONFIG_PARAVIRT_SPINLOCKS - -/* - * This code runs before CPU feature bits are set. By default, the - * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if - * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 - * is provided to handle this situation and, instead, use a variable that - * has been set by the early boot code. - */ -#define USE_EARLY_PGTABLE_L5 - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mem_encrypt.h> -#include <linux/cc_platform.h> - -#include <asm/init.h> -#include <asm/setup.h> -#include <asm/sections.h> -#include <asm/coco.h> -#include <asm/sev.h> - -#include "mm_internal.h" - -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS _KERNPG_TABLE_NOENC - -#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) - -#define PMD_FLAGS_DEC PMD_FLAGS_LARGE -#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \ - (_PAGE_PAT_LARGE | _PAGE_PWT)) - -#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) - -#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) - -#define PTE_FLAGS_DEC PTE_FLAGS -#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) - -struct sme_populate_pgd_data { - void *pgtable_area; - pgd_t *pgd; - - pmdval_t pmd_flags; - pteval_t pte_flags; - unsigned long paddr; - - unsigned long vaddr; - unsigned long vaddr_end; -}; - -/* - * This work area lives in the .init.scratch section, which lives outside of - * the kernel proper. It is sized to hold the intermediate copy buffer and - * more than enough pagetable pages. - * - * By using this section, the kernel can be encrypted in place and it - * avoids any possibility of boot parameters or initramfs images being - * placed such that the in-place encryption logic overwrites them. This - * section is 2MB aligned to allow for simple pagetable setup using only - * PMD entries (see vmlinux.lds.S). 
- */ -static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); - -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) -{ - unsigned long pgd_start, pgd_end, pgd_size; - pgd_t *pgd_p; - - pgd_start = ppd->vaddr & PGDIR_MASK; - pgd_end = ppd->vaddr_end & PGDIR_MASK; - - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - - memset(pgd_p, 0, pgd_size); -} - -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - pgd = ppd->pgd + pgd_index(ppd->vaddr); - if (pgd_none(*pgd)) { - p4d = ppd->pgtable_area; - memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); - ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; - set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); - } - - p4d = p4d_offset(pgd, ppd->vaddr); - if (p4d_none(*p4d)) { - pud = ppd->pgtable_area; - memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; - set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); - } - - pud = pud_offset(p4d, ppd->vaddr); - if (pud_none(*pud)) { - pmd = ppd->pgtable_area; - memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); - ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; - set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); - } - - if (pud_leaf(*pud)) - return NULL; - - return pud; -} - -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) -{ - pud_t *pud; - pmd_t *pmd; - - pud = sme_prepare_pgd(ppd); - if (!pud) - return; - - pmd = pmd_offset(pud, ppd->vaddr); - if (pmd_leaf(*pmd)) - return; - - set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); -} - -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) -{ - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = sme_prepare_pgd(ppd); - if (!pud) - return; - - pmd = pmd_offset(pud, ppd->vaddr); - if (pmd_none(*pmd)) { - pte = ppd->pgtable_area; - memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE); - ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE; - set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); - } - - if (pmd_leaf(*pmd)) - return; - - pte = pte_offset_kernel(pmd, ppd->vaddr); - if (pte_none(*pte)) - set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); -} - -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd_large(ppd); - - ppd->vaddr += PMD_SIZE; - ppd->paddr += PMD_SIZE; - } -} - -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd(ppd); - - ppd->vaddr += PAGE_SIZE; - ppd->paddr += PAGE_SIZE; - } -} - -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, - pmdval_t pmd_flags, pteval_t pte_flags) -{ - unsigned long vaddr_end; - - ppd->pmd_flags = pmd_flags; - ppd->pte_flags = pte_flags; - - /* Save original end value since we modify the struct value */ - vaddr_end = ppd->vaddr_end; - - /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE); - __sme_map_range_pte(ppd); - - /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_MASK; - __sme_map_range_pmd(ppd); - - /* If end is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = vaddr_end; - __sme_map_range_pte(ppd); -} - -static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); -} - -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, 
PMD_FLAGS_DEC, PTE_FLAGS_DEC); -} - -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); -} - -static unsigned long __head sme_pgtable_calc(unsigned long len) -{ - unsigned long entries = 0, tables = 0; - - /* - * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. Those mappings will be covered mostly - * by 2MB PMD entries so we can conservatively calculate the required - * number of P4D, PUD and PMD structures needed to perform the - * mappings. For mappings that are not 2MB aligned, PTE mappings - * would be needed for the start and end portion of the address range - * that fall outside of the 2MB alignment. This results in, at most, - * two extra pages to hold PTE entries for each range that is mapped. - * Incrementing the count for each covers the case where the addresses - * cross entries. - */ - - /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ - if (PTRS_PER_P4D > 1) - entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; - entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; - entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; - entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; - - /* - * Now calculate the added pagetable structures needed to populate - * the new pagetables. - */ - - if (PTRS_PER_P4D > 1) - tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; - tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; - tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; - - return entries + tables; -} - -void __head sme_encrypt_kernel(struct boot_params *bp) -{ - unsigned long workarea_start, workarea_end, workarea_len; - unsigned long execute_start, execute_end, execute_len; - unsigned long kernel_start, kernel_end, kernel_len; - unsigned long initrd_start, initrd_end, initrd_len; - struct sme_populate_pgd_data ppd; - unsigned long pgtable_area_len; - unsigned long decrypted_base; - - /* - * This is early code, use an open coded check for SME instead of - * using cc_platform_has(). This eliminates worries about removing - * instrumentation or checking boot_cpu_data in the cc_platform_has() - * function. - */ - if (!sme_get_me_mask() || - RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED) - return; - - /* - * Prepare for encrypting the kernel and initrd by building new - * pagetables with the necessary attributes needed to encrypt the - * kernel in place. - * - * One range of virtual addresses will map the memory occupied - * by the kernel and initrd as encrypted. - * - * Another range of virtual addresses will map the memory occupied - * by the kernel and initrd as decrypted and write-protected. - * - * The use of write-protect attribute will prevent any of the - * memory from being cached. 
- */ - - kernel_start = (unsigned long)RIP_REL_REF(_text); - kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE); - kernel_len = kernel_end - kernel_start; - - initrd_start = 0; - initrd_end = 0; - initrd_len = 0; -#ifdef CONFIG_BLK_DEV_INITRD - initrd_len = (unsigned long)bp->hdr.ramdisk_size | - ((unsigned long)bp->ext_ramdisk_size << 32); - if (initrd_len) { - initrd_start = (unsigned long)bp->hdr.ramdisk_image | - ((unsigned long)bp->ext_ramdisk_image << 32); - initrd_end = PAGE_ALIGN(initrd_start + initrd_len); - initrd_len = initrd_end - initrd_start; - } -#endif - - /* - * Calculate required number of workarea bytes needed: - * executable encryption area size: - * stack page (PAGE_SIZE) - * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_SIZE) - * pagetable structures for the encryption of the kernel - * pagetable structures for workarea (in case not currently mapped) - */ - execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea); - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; - execute_len = execute_end - execute_start; - - /* - * One PGD for both encrypted and decrypted mappings and a set of - * PUDs and PMDs for each of the encrypted and decrypted mappings. - */ - pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; - pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; - if (initrd_len) - pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; - - /* PUDs and PMDs needed in the current pagetables for the workarea */ - pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); - - /* - * The total workarea includes the executable encryption area and - * the pagetable area. The start of the workarea is already 2MB - * aligned, align the end of the workarea on a 2MB boundary so that - * we don't try to create/allocate PTE entries from the workarea - * before it is mapped. - */ - workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE); - - /* - * Set the address to the start of where newly created pagetable - * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable - * structures are created when the workarea is added to the current - * pagetables and when the new encrypted and decrypted kernel - * mappings are populated. - */ - ppd.pgtable_area = (void *)execute_end; - - /* - * Make sure the current pagetable structure has entries for - * addressing the workarea. - */ - ppd.pgd = (pgd_t *)native_read_cr3_pa(); - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); - - /* - * A new pagetable structure is being built to allow for the kernel - * and initrd to be encrypted. It starts with an empty PGD that will - * then be populated with new PUDs and PMDs as the encrypted and - * decrypted kernel mappings are created. - */ - ppd.pgd = ppd.pgtable_area; - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; - - /* - * A different PGD index/entry must be used to get different - * pagetable entries for the decrypted mapping. Choose the next - * PGD index and convert it to a virtual address to be used as - * the base of the mapping. 
- */ - decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); - if (initrd_len) { - unsigned long check_base; - - check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); - decrypted_base = max(decrypted_base, check_base); - } - decrypted_base <<= PGDIR_SHIFT; - - /* Add encrypted kernel (identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start; - ppd.vaddr_end = kernel_end; - sme_map_range_encrypted(&ppd); - - /* Add decrypted, write-protected kernel (non-identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - - if (initrd_len) { - /* Add encrypted initrd (identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start; - ppd.vaddr_end = initrd_end; - sme_map_range_encrypted(&ppd); - /* - * Add decrypted, write-protected initrd (non-identity) mappings - */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - } - - /* Add decrypted workarea mappings to both kernel mappings */ - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_map_range_decrypted(&ppd); - - /* Perform the encryption */ - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)ppd.pgd); - - if (initrd_len) - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, - initrd_len, workarea_start, - (unsigned long)ppd.pgd); - - /* - * At this point we are running encrypted. Remove the mappings for - * the decrypted areas - all that is needed for this is to remove - * the PGD entry/entries. - */ - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_clear_pgd(&ppd); - - if (initrd_len) { - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_clear_pgd(&ppd); - } - - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_clear_pgd(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); -} - -void __head sme_enable(struct boot_params *bp) -{ - unsigned int eax, ebx, ecx, edx; - unsigned long feature_mask; - unsigned long me_mask; - bool snp_en; - u64 msr; - - snp_en = snp_init(bp); - - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - -#define AMD_SME_BIT BIT(0) -#define AMD_SEV_BIT BIT(1) - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - /* Check whether SEV or SME is supported */ - if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) - return; - - me_mask = 1UL << (ebx & 0x3f); - - /* Check the SEV MSR whether SEV or SME is enabled */ - RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); - feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? 
AMD_SEV_BIT : AMD_SME_BIT; - - /* - * Any discrepancies between the presence of a CC blob and SNP - * enablement abort the guest. - */ - if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); - - /* Check if memory encryption is enabled */ - if (feature_mask == AMD_SME_BIT) { - if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION)) - return; - - /* - * No SME if Hypervisor bit is set. This check is here to - * prevent a guest from trying to enable SME. For running as a - * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there - * might be other hypervisors which emulate that MSR as non-zero - * or even pass it through to the guest. - * A malicious hypervisor can still trick a guest into this - * path, but there is no way to protect against that. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (ecx & BIT(31)) - return; - - /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_AMD64_SYSCFG); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - } - - RIP_REL_REF(sme_me_mask) = me_mask; - RIP_REL_REF(physical_mask) &= ~me_mask; - RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD; - cc_set_mask(me_mask); -} diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 3f37b5c80bb3..097aadc250f7 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -25,4 +25,8 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); extern unsigned long tlb_single_page_flush_ceiling; +#ifdef CONFIG_NUMA +void __init x86_numa_init(void); +#endif + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 64e5cdb2460a..c24890c40138 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -18,9 +18,10 @@ #include <asm/e820/api.h> #include <asm/proto.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/numa.h> +#include <asm/amd/nb.h> -#include "numa_internal.h" +#include "mm_internal.h" int numa_off; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c deleted file mode 100644 index 65fda406e6f2..000000000000 --- a/arch/x86/mm/numa_32.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation - * August 2002: added remote node KVA remap - Martin J. Bligh - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include <linux/memblock.h> -#include <linux/init.h> -#include <linux/vmalloc.h> -#include <asm/pgtable_areas.h> - -#include "numa_internal.h" - -extern unsigned long highend_pfn, highstart_pfn; - -void __init initmem_init(void) -{ - x86_numa_init(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) - highstart_pfn = max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", - max_low_pfn, highstart_pfn); - - printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - - printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - - __vmalloc_start_set = true; - setup_bootmem_allocator(); -} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c deleted file mode 100644 index 59d80160fa5a..000000000000 --- a/arch/x86/mm/numa_64.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Generic VM initialization for x86-64 NUMA setups. - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include <linux/memblock.h> - -#include "numa_internal.h" - -void __init initmem_init(void) -{ - x86_numa_init(); -} diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h deleted file mode 100644 index 11e1ff370c10..000000000000 --- a/arch/x86/mm/numa_internal.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __X86_MM_NUMA_INTERNAL_H -#define __X86_MM_NUMA_INTERNAL_H - -#include <linux/types.h> -#include <asm/numa.h> - -void __init x86_numa_init(void); - -#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 72d8cbc61158..c97b527c66fe 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -38,6 +38,7 @@ #include <linux/kernel.h> #include <linux/pfn_t.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> @@ -232,7 +233,7 @@ void pat_cpu_init(void) panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } - wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); + wrmsrq(MSR_IA32_CR_PAT, pat_msr_val); __flush_tlb_all(); } @@ -256,7 +257,7 @@ void __init pat_bp_init(void) if (!cpu_feature_enabled(X86_FEATURE_PAT)) pat_disable("PAT not supported by the CPU."); else - rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); + rdmsrq(MSR_IA32_CR_PAT, pat_msr_val); if (!pat_msr_val) { pat_disable("PAT support disabled by the firmware."); @@ -682,6 +683,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr) /** * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type * of @pfn cannot be overridden by UC MTRR memory type. + * @pfn: The page frame number to check. * * Only to be called when PAT is enabled. 
* @@ -773,38 +775,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } -#ifdef CONFIG_STRICT_DEVMEM -/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - return 1; -} -#else -/* This check is needed to avoid cache aliasing when PAT is enabled */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - u64 from = ((u64)pfn) << PAGE_SHIFT; - u64 to = from + size; - u64 cursor = from; - - if (!pat_enabled()) - return 1; - - while (cursor < to) { - if (!devmem_is_allowed(pfn)) - return 0; - cursor += PAGE_SIZE; - pfn++; - } - return 1; -} -#endif /* CONFIG_STRICT_DEVMEM */ - int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; + if (!pat_enabled()) + return 1; + if (!range_is_allowed(pfn, size)) return 0; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index def3d9284254..30ab4aced761 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -889,7 +889,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 - if (!SHARED_KERNEL_PMD) { + { struct page *page; list_for_each_entry(page, &pgd_list, lru) { @@ -1293,7 +1293,7 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, /* Queue the page table to be freed after TLB flush */ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); - if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + if (IS_ENABLED(CONFIG_X86_32)) { struct page *page; /* Update all PGD tables to use the same large page */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811..62777ba4de1a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -10,6 +10,7 @@ #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); +SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -68,12 +69,6 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&ptdesc->pt_list); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -#define MAX_UNSHARED_PTRS_PER_PGD \ - MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) - - static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; @@ -86,29 +81,19 @@ struct mm_struct *pgd_page_get_mm(struct page *page) static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (CONFIG_PGTABLE_LEVELS == 2 || - (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - CONFIG_PGTABLE_LEVELS >= 4) { + /* PAE preallocates all its PMDs. No cloning needed. 
*/ + if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - } - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) { - pgd_set_mm(pgd, mm); - pgd_list_add(pgd); - } + /* List used to sync kernel mapping updates */ + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { - if (SHARED_KERNEL_PMD) - return; - spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); @@ -132,15 +117,15 @@ static void pgd_dtor(pgd_t *pgd) * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. */ -#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD -#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD +#define PREALLOCATED_PMDS PTRS_PER_PGD /* + * "USER_PMDS" are the PMDs for the user copy of the page tables when + * PTI is enabled. They do not exist when PTI is disabled. Note that + * this is distinct from the user _portion_ of the kernel page tables + * which always exists. + * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. @@ -169,7 +154,6 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 -#define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ @@ -318,82 +302,28 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, { } #endif -/* - * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also - * assumes that pgd should be in one page. - * - * But kernel with PAE paging that is not running as a Xen domain - * only needs to allocate 32 bytes for pgd instead of one page. - */ -#ifdef CONFIG_X86_PAE - -#include <linux/slab.h> - -#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) -#define PGD_ALIGN 32 - -static struct kmem_cache *pgd_cache; - -void __init pgtable_cache_init(void) -{ - /* - * When PAE kernel is running as a Xen domain, it does not use - * shared kernel pmd. And this requires a whole page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return; - - /* - * when PAE kernel is not running as a Xen domain, it uses - * shared kernel pmd. Shared kernel pmd does not require a whole - * page for pgd. We are able to just allocate a 32-byte for pgd. - * During boot time, we create a 32-byte slab for pgd table allocation. - */ - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, - SLAB_PANIC, NULL); -} static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* - * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. - * We allocate one page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); - - /* - * Now PAE kernel is not running as a Xen domain. We can allocate - * a 32-byte slab for pgd to save memory space. + * PTI and Xen need a whole page for the PAE PGD + * even though the hardware only needs 32 bytes. + * + * For simplicity, allocate a page for all users. 
*/ - return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); -} - -static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - if (!SHARED_KERNEL_PMD) - __pgd_free(mm, pgd); - else - kmem_cache_free(pgd_cache, pgd); -} -#else - -static inline pgd_t *_pgd_alloc(struct mm_struct *mm) -{ - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } -#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; - pmd_t *pmds[MAX_PREALLOCATED_PMDS]; + pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); @@ -613,11 +543,11 @@ pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, #endif /** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve + * reserve_top_address - Reserve a hole in the top of the kernel address space + * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. + * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { @@ -662,9 +592,12 @@ void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -#ifdef CONFIG_X86_5LEVEL +#if CONFIG_PGTABLE_LEVELS > 4 /** - * p4d_set_huge - setup kernel P4D mapping + * p4d_set_huge - Set up kernel P4D mapping + * @p4d: Pointer to the P4D entry + * @addr: Virtual address associated with the P4D entry + * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ @@ -674,9 +607,10 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) } /** - * p4d_clear_huge - clear kernel P4D mapping when it is set + * p4d_clear_huge - Clear kernel P4D mapping when it is set + * @p4d: Pointer to the P4D entry to clear * - * No 512GB pages yet -- always return 0 + * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { @@ -684,7 +618,10 @@ void p4d_clear_huge(p4d_t *p4d) #endif /** - * pud_set_huge - setup kernel PUD mapping + * pud_set_huge - Set up kernel PUD mapping + * @pud: Pointer to the PUD entry + * @addr: Virtual address associated with the PUD entry + * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR @@ -715,7 +652,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) } /** - * pmd_set_huge - setup kernel PMD mapping + * pmd_set_huge - Set up kernel PMD mapping + * @pmd: Pointer to the PMD entry + * @addr: Virtual address associated with the PMD entry + * @prot: Protection bits to use * * See text over pud_set_huge() above. * @@ -744,7 +684,8 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) } /** - * pud_clear_huge - clear kernel PUD mapping when it is set + * pud_clear_huge - Clear kernel PUD mapping when it is set + * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ @@ -759,7 +700,8 @@ int pud_clear_huge(pud_t *pud) } /** - * pmd_clear_huge - clear kernel PMD mapping when it is set + * pmd_clear_huge - Clear kernel PMD mapping when it is set + * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). 
*/ @@ -775,11 +717,11 @@ int pmd_clear_huge(pmd_t *pmd) #ifdef CONFIG_X86_64 /** - * pud_free_pmd_page - Clear pud entry and free pmd page. - * @pud: Pointer to a PUD. - * @addr: Virtual address associated with pud. + * pud_free_pmd_page - Clear PUD entry and free PMD page + * @pud: Pointer to a PUD + * @addr: Virtual address associated with PUD * - * Context: The pud range has been unmapped and TLB purged. + * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. @@ -822,11 +764,11 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } /** - * pmd_free_pte_page - Clear pmd entry and free pte page. - * @pmd: Pointer to a PMD. - * @addr: Virtual address associated with pmd. + * pmd_free_pte_page - Clear PMD entry and free PTE page. + * @pmd: Pointer to the PMD + * @addr: Virtual address associated with PMD * - * Context: The pmd range has been unmapped and TLB purged. + * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) @@ -848,7 +790,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) /* * Disable free page handling on x86-PAE. This assures that ioremap() - * does not update sync'd pmd entries. See vmalloc_sync_one(). + * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5f0d579932c6..190299834011 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } - BUILD_BUG_ON(pgd_leaf(*pgd) != 0); + BUILD_BUG_ON(pgd_leaf(*pgd)); return p4d_offset(pgd, address); } @@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!p4d) return NULL; - BUILD_BUG_ON(p4d_leaf(*p4d) != 0); + BUILD_BUG_ON(p4d_leaf(*p4d)); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..39f80111e6f1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -19,6 +19,7 @@ #include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlb.h> @@ -215,16 +216,20 @@ static void clear_asid_other(void) atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +struct new_asid { + unsigned int asid : 16; + unsigned int need_flush : 1; +}; -static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - u16 *new_asid, bool *need_flush) +static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen) { + struct new_asid ns; u16 asid; if (!static_cpu_has(X86_FEATURE_PCID)) { - *new_asid = 0; - *need_flush = true; - return; + ns.asid = 0; + ns.need_flush = 1; + return ns; } /* @@ -235,9 +240,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 global_asid = mm_global_asid(next); if (global_asid) { - *new_asid = global_asid; - *need_flush = false; - return; + ns.asid = global_asid; + ns.need_flush = 0; + return ns; } } @@ -249,22 +254,23 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, next->context.ctx_id) continue; - *new_asid = asid; - *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < - next_tlb_gen); - return; + ns.asid = 
asid; + ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen); + return ns; } /* * We don't currently own an ASID slot on this CPU. * Allocate a slot. */ - *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; - if (*new_asid >= TLB_NR_DYN_ASIDS) { - *new_asid = 0; + ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (ns.asid >= TLB_NR_DYN_ASIDS) { + ns.asid = 0; this_cpu_write(cpu_tlbstate.next_asid, 1); } - *need_flush = true; + ns.need_flush = true; + + return ns; } /* @@ -623,7 +629,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, { /* Flush L1D if the outgoing task requests it */ if (prev_mm & LAST_USER_MM_L1D_FLUSH) - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); /* Check whether the incoming task opted in for L1D flush */ if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) @@ -667,9 +673,9 @@ static void cond_mitigation(struct task_struct *next) prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); /* - * Avoid user/user BTB poisoning by flushing the branch predictor - * when switching between processes. This stops one process from - * doing Spectre-v2 attacks on another. + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the @@ -781,9 +787,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); unsigned long new_lam; + struct new_asid ns; u64 next_tlb_gen; - bool need_flush; - u16 new_asid; + /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) @@ -847,14 +853,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); /* Check if the current mm is transitioning to a global ASID */ if (mm_needs_global_asid(next, prev_asid)) { next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); goto reload_tlb; } @@ -889,8 +896,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below. */ - new_asid = prev_asid; - need_flush = true; + ns.asid = prev_asid; + ns.need_flush = true; } else { /* * Apply process to process speculation vulnerability @@ -899,40 +906,33 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); - /* - * Leave this CPU in prev's mm_cpumask. 
Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); } reload_tlb: new_lam = mm_lam_cr3_mask(next); - if (need_flush) { - VM_WARN_ON_ONCE(is_global_asid(new_asid)); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, new_lam, true); + if (ns.need_flush) { + VM_WARN_ON_ONCE(is_global_asid(ns.asid)); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, new_lam, false); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -941,7 +941,7 @@ reload_tlb: barrier(); this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid); cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { @@ -972,6 +972,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) } /* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. Temporary mms can also be used for EFI runtime service + * calls or similar functionality. + * + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. + */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. 
+ * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *prev_mm) +{ + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * @@ -1204,8 +1275,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1293,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ |
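
For the tlb.c hunk above: choose_new_asid() now returns a small two-field struct instead of filling the *new_asid and *need_flush output parameters. The stand-alone sketch below only illustrates that return-by-value shape; the struct name and fields mirror the diff, while choose_new_asid_demo(), its parameters and main() are invented for the example and carry none of the real per-CPU ASID bookkeeping.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirrors the struct introduced in the arch/x86/mm/tlb.c hunk above:
 * a 16-bit ASID plus a single need_flush bit, returned by value.
 */
struct new_asid {
	unsigned int asid       : 16;
	unsigned int need_flush : 1;
};

/*
 * Illustrative stand-in for choose_new_asid(): the real function scans
 * per-CPU tlbstate contexts; this demo only shows the return-by-value
 * shape. The 'candidate' and 'stale' parameters are invented here.
 */
static struct new_asid choose_new_asid_demo(uint16_t candidate, bool stale)
{
	struct new_asid ns;

	ns.asid = candidate;
	ns.need_flush = stale;
	return ns;
}

int main(void)
{
	struct new_asid ns = choose_new_asid_demo(3, true);

	printf("asid=%u need_flush=%u\n", (unsigned)ns.asid, (unsigned)ns.need_flush);
	return 0;
}

Returning the pair by value keeps an ASID and its flush decision together, which is why switch_mm_irqs_off() in the diff carries a single struct new_asid ns instead of two separate locals.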