Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/cpu_entry_area.c  |  64
-rw-r--r-- | arch/x86/mm/dump_pagetables.c |   4
-rw-r--r-- | arch/x86/mm/extable.c         |   4
-rw-r--r-- | arch/x86/mm/fault.c           |  58
-rw-r--r-- | arch/x86/mm/hugetlbpage.c     |   2
-rw-r--r-- | arch/x86/mm/init.c            |  43
-rw-r--r-- | arch/x86/mm/init_32.c         |  11
-rw-r--r-- | arch/x86/mm/init_64.c         | 164
-rw-r--r-- | arch/x86/mm/kaslr.c           |  94
-rw-r--r-- | arch/x86/mm/mem_encrypt.c     |  10
-rw-r--r-- | arch/x86/mm/mm_internal.h     |   3
-rw-r--r-- | arch/x86/mm/mpx.c             |   6
-rw-r--r-- | arch/x86/mm/pageattr.c        |  16
-rw-r--r-- | arch/x86/mm/pgtable.c         |  14
-rw-r--r-- | arch/x86/mm/pkeys.c           |  21
-rw-r--r-- | arch/x86/mm/pti.c             |   6
-rw-r--r-- | arch/x86/mm/tlb.c             | 116
17 files changed, 394 insertions, 242 deletions
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 19c6abf9ea31..752ad11d6868 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void __init percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(unsigned int cpu) { #ifdef CONFIG_CPU_SUP_INTEL - int npages; + unsigned int npages; void *cea; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) @@ -78,9 +78,43 @@ static void __init percpu_setup_debug_store(int cpu) #endif } +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. DB2 is + * not mapped; it just exists to catch triple nesting of #DB. + */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB1); + cea_map_stack(DB); + cea_map_stack(MCE); +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +#endif + /* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(unsigned int cpu) { + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ pgprot_t gdt_prot = PAGE_KERNEL_RO; @@ -101,10 +135,9 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), - gdt_prot); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + cea_map_percpu_pages(&cea->entry_stack_page, per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); @@ -128,22 +161,15 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, - &per_cpu(cpu_tss_rw, cpu), + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + per_cpu(cpu_entry_area, cpu) = cea; #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, - &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); -#endif + percpu_setup_exception_stacks(cpu); + percpu_setup_debug_store(cpu); } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index c0309ea9abee..6a7302d1161f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -578,7 +578,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (user && static_cpu_has(X86_FEATURE_PTI)) + if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif ptdump_walk_pgd_level_core(m, pgd, false, false); @@ -591,7 +591,7 @@ void ptdump_walk_user_pgd_level_checkwx(void) pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || - !static_cpu_has(X86_FEATURE_PTI)) + !boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3c4568f8fb28..b0a2de8d2f9e 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -145,7 +145,7 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); @@ -162,7 +162,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) show_stack_regs(regs); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..46df4c6aae46 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,6 +28,7 @@ #include <asm/mmu_context.h> /* vma_pkey() */ #include 
<asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ +#include <asm/cpu_entry_area.h> /* exception stack */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -359,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later @@ -603,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } -/* - * This helper function transforms the #PF error_code bits into - * "[PROT] [USER]" type of descriptive, almost human-readable error strings: - */ -static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) -{ - if (error_code & mask) { - if (buf[0]) - strcat(buf, " "); - strcat(buf, txt); - } -} - static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - char err_txt[64]; - if (!oops_may_print()) return; @@ -644,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad from_kuid(&init_user_ns, current_uid())); } - pr_alert("BUG: unable to handle kernel %s at %px\n", - address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", - (void *)address); - - err_txt[0] = 0; - - /* - * Note: length of these appended strings including the separation space and the - * zero delimiter must fit into err_txt[]. - */ - err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); - err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); - err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); - err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); - err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); - err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); - - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + if (address < PAGE_SIZE && !user_mode(regs)) + pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", + (void *)address); + else + pr_alert("BUG: unable to handle page fault for address: %px\n", + (void *)address); + + pr_alert("#PF: %s %s in %s mode\n", + (error_code & X86_PF_USER) ? "user" : "supervisor", + (error_code & X86_PF_INSTR) ? "instruction fetch" : + (error_code & X86_PF_WRITE) ? "write access" : + "read access", + user_mode(regs) ? "user" : "kernel"); + pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : + "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; - pr_alert("This was a system access from user code\n"); - /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps @@ -793,7 +775,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space * left. 
It's plausible that we'd hit this condition but diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 92e4c4b85bba..fab095362c50 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f905a2371080..fd10d91a6115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,6 +5,8 @@ #include <linux/memblock.h> #include <linux/swapfile.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> +#include <linux/sched/task.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -22,6 +24,7 @@ #include <asm/hypervisor.h> #include <asm/cpufeature.h> #include <asm/pti.h> +#include <asm/text-patching.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -701,6 +704,41 @@ void __init init_mem_mapping(void) } /* + * Initialize an mm_struct to be used during poking and a pointer to be used + * during patching. + */ +void __init poking_init(void) +{ + spinlock_t *ptl; + pte_t *ptep; + + poking_mm = copy_init_mm(); + BUG_ON(!poking_mm); + + /* + * Randomize the poking address, but make sure that the following page + * will be mapped at the same PMD. We need 2 pages, so find space for 3, + * and adjust the address if the PMD ends after the first one. + */ + poking_addr = TASK_UNMAPPED_BASE; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); + + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + poking_addr += PAGE_SIZE; + + /* + * We need to trigger the allocation of the page-tables that will be + * needed for poking now. Later, poking may be performed in an atomic + * section, which might cause allocation to fail. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + +/* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * @@ -766,6 +804,11 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. 
+ */ + kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..075e568098f2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,24 +850,25 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..62fc457f3849 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,6 +58,37 @@ #include "ident_map.c" +#define DEFINE_POPULATE(fname, type1, type2, init) \ +static inline void fname##_init(struct mm_struct *mm, \ + type1##_t *arg1, type2##_t *arg2, bool init) \ +{ \ + if (init) \ + fname##_safe(mm, arg1, arg2); \ + else \ + fname(mm, arg1, arg2); \ +} + +DEFINE_POPULATE(p4d_populate, p4d, pud, init) +DEFINE_POPULATE(pgd_populate, pgd, p4d, init) +DEFINE_POPULATE(pud_populate, pud, pmd, init) +DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) + +#define DEFINE_ENTRY(type1, type2, init) \ +static inline void set_##type1##_init(type1##_t *arg1, \ + type2##_t arg2, bool init) \ +{ \ + if (init) \ + set_##type1##_safe(arg1, arg2); \ + else \ + set_##type1(arg1, arg2); \ +} + +DEFINE_ENTRY(p4d, p4d, init) +DEFINE_ENTRY(pud, pud, init) +DEFINE_ENTRY(pmd, pmd, init) +DEFINE_ENTRY(pte, pte, init) + + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -414,7 +445,7 @@ void __init cleanup_highmap(void) */ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, - pgprot_t prot) + pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte_safe(pte, __pte(0)); + set_pte_init(pte, __pte(0), init); continue; } @@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, - 
unsigned long page_size_mask, pgprot_t prot) + unsigned long page_size_mask, pgprot_t prot, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd_safe(pmd, __pmd(0)); + set_pmd_init(pmd, __pmd(0), init); continue; } @@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = phys_pte_init(pte, paddr, - paddr_end, prot); + paddr_end, prot, + init); spin_unlock(&init_mm.page_table_lock); continue; } @@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pmd, - pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + set_pte_init((pte_t *)pmd, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE)), + init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; } pte = alloc_low_page(); - paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel_safe(&init_mm, pmd, pte); + pmd_populate_kernel_init(&init_mm, pmd, pte, init); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, */ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, - unsigned long page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long pages = 0, paddr_next; unsigned long paddr_last = paddr_end; @@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud_safe(pud, __pud(0)); + set_pud_init(pud, __pud(0), init); continue; } @@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pmd_init(pmd, paddr, paddr_end, page_size_mask, - prot); + prot, init); continue; } /* @@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte_safe((pte_t *)pud, - pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE)); + set_pte_init((pte_t *)pud, + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE), + init); spin_unlock(&init_mm.page_table_lock); paddr_last = paddr_next; continue; @@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, pmd = alloc_low_page(); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, - page_size_mask, prot); + page_size_mask, prot, init); spin_lock(&init_mm.page_table_lock); - pud_populate_safe(&init_mm, pud, pmd); + pud_populate_init(&init_mm, pud, pmd, init); spin_unlock(&init_mm.page_table_lock); } @@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, static unsigned long __meminit phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, - unsigned long 
page_size_mask) + unsigned long page_size_mask, bool init) { unsigned long paddr_next, paddr_last = paddr_end; unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); if (!pgtable_l5_enabled()) - return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, + page_size_mask, init); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { p4d_t *p4d; @@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d_safe(p4d, __p4d(0)); + set_p4d_init(p4d, __p4d(0), init); continue; } if (!p4d_none(*p4d)) { pud = pud_offset(p4d, 0); - paddr_last = phys_pud_init(pud, paddr, - paddr_end, - page_size_mask); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask, init); continue; } pud = alloc_low_page(); paddr_last = phys_pud_init(pud, paddr, paddr_end, - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); - p4d_populate_safe(&init_mm, p4d, pud); + p4d_populate_init(&init_mm, p4d, pud, init); spin_unlock(&init_mm.page_table_lock); } return paddr_last; } -/* - * Create page table mapping for the physical memory for specific physical - * addresses. The virtual and physical addresses have to be aligned on PMD level - * down. It returns the last physical address mapped. - */ -unsigned long __meminit -kernel_physical_mapping_init(unsigned long paddr_start, - unsigned long paddr_end, - unsigned long page_size_mask) +static unsigned long __meminit +__kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask, + bool init) { bool pgd_changed = false; unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; @@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start, p4d = (p4d_t *)pgd_page_vaddr(*pgd); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, + init); continue; } p4d = alloc_low_page(); paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), - page_size_mask); + page_size_mask, init); spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate_safe(&init_mm, pgd, p4d); + pgd_populate_init(&init_mm, pgd, p4d, init); else - p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), + (pud_t *) p4d, init); + spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start, return paddr_last; } + +/* + * Create page table mapping for the physical memory for specific physical + * addresses. Note that it can only be used to populate non-present entries. + * The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. + */ +unsigned long __meminit +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, true); +} + +/* + * This function is similar to kernel_physical_mapping_init() above with the + * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe() + * when updating the mapping. The caller is responsible to flush the TLBs after + * the function returns. 
+ */ +unsigned long __meminit +kernel_physical_mapping_change(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, false); +} + #ifndef CONFIG_NUMA void __init initmem_init(void) { @@ -777,11 +841,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +855,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD @@ -1141,24 +1205,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); struct zone *zone; - int ret; /* With altmap the first mapped page is offset from @start */ if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - WARN_ON_ONCE(ret); + __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index d669c5e797e0..dc3f058bdf9b 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (pgtable_l5_enabled()) - entropy = (rand % (entropy + 1)) & P4D_MASK; - else - entropy = (rand % (entropy + 1)) & PUD_MASK; + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void) * randomization alignment. 
*/ vaddr += get_padding(&kaslr_regions[i]); - if (pgtable_l5_enabled()) - vaddr = round_up(vaddr + 1, P4D_SIZE); - else - vaddr = round_up(vaddr + 1, PUD_SIZE); + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } static void __meminit init_trampoline_pud(void) { - unsigned long paddr, paddr_next; + pud_t *pud_page_tramp, *pud, *pud_tramp; + p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; + unsigned long paddr, vaddr; pgd_t *pgd; - pud_t *pud_page, *pud_page_tramp; - int i; pud_page_tramp = alloc_low_page(); + /* + * There are two mappings for the low 1MB area, the direct mapping + * and the 1:1 mapping for the real mode trampoline: + * + * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET + * 1:1 mapping: virt_addr = phys_addr + */ paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - pud_page = (pud_t *) pgd_page_vaddr(*pgd); - - for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { - pud_t *pud, *pud_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + vaddr = (unsigned long)__va(paddr); + pgd = pgd_offset_k(vaddr); - pud_tramp = pud_page_tramp + pud_index(paddr); - pud = pud_page + pud_index(vaddr); - paddr_next = (paddr & PUD_MASK) + PUD_SIZE; - - *pud_tramp = *pud; - } + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); -} - -static void __meminit init_trampoline_p4d(void) -{ - unsigned long paddr, paddr_next; - pgd_t *pgd; - p4d_t *p4d_page, *p4d_page_tramp; - int i; + pud_tramp = pud_page_tramp + pud_index(paddr); + *pud_tramp = *pud; - p4d_page_tramp = alloc_low_page(); - - paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); - - for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { - p4d_t *p4d, *p4d_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + if (pgtable_l5_enabled()) { + p4d_page_tramp = alloc_low_page(); p4d_tramp = p4d_page_tramp + p4d_index(paddr); - p4d = p4d_page + p4d_index(vaddr); - paddr_next = (paddr & P4D_MASK) + P4D_SIZE; - *p4d_tramp = *p4d; - } + set_p4d(p4d_tramp, + __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + } else { + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + } } /* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. 
*/ void __meminit init_trampoline(void) { - if (!kaslr_memory_enabled()) { init_trampoline_default(); return; } - if (pgtable_l5_enabled()) - init_trampoline_p4d(); - else - init_trampoline_pud(); + init_trampoline_pud(); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 385afa2b9e17..51f50a7a07ef 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, else split_page_size_mask = 1 << PG_LEVEL_2M; - kernel_physical_mapping_init(__pa(vaddr & pmask), - __pa((vaddr_end & pmask) + psize), - split_page_size_mask); + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. + */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); } ret = 0; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 319bde386d5f..eeae142062ed 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void); unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); +unsigned long kernel_physical_mapping_change(unsigned long start, + unsigned long end, + unsigned long page_size_mask); void zone_sizes_init(void); extern int after_bootmem; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c805db6236b4..59726aaf4671 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -142,7 +142,7 @@ int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; @@ -190,7 +190,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. 
*/ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -376,7 +376,7 @@ static int do_mpx_bt_fault(void) const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return -EINVAL; /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4c570612e24e..daf4d645e537 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; @@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } - #endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ - int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd01709a091..1f67b1e15bf6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ static void pgd_dtor(pgd_t *pgd) * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ -#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ +#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS @@ -292,7 +292,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); @@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. 
*/ if (!SHARED_KERNEL_PMD) - return 0; + return; /* * when PAE kernel is not running as a Xen domain, it uses @@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); - return 0; } -core_initcall(pgd_cache_init); static inline pgd_t *_pgd_alloc(void) { @@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) } #else +void __init pgd_cache_init(void) +{ +} + static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 047a77f6a10c..1dcfc91c8f0c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,6 +18,7 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/fpu/internal.h> /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -39,17 +40,12 @@ int __execute_only_pkey(struct mm_struct *mm) * dance to set PKRU if we do not need to. Check it * first and assume that if the execute-only pkey is * write-disabled that we do not have to set it - * ourselves. We need preempt off so that nobody - * can make fpregs inactive. + * ourselves. */ - preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { - preempt_enable(); return execute_only_pkey; } - preempt_enable(); /* * Set up PKRU so that it denies access for everything @@ -131,7 +127,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | @@ -148,13 +143,6 @@ void copy_init_pkru_to_fpregs(void) { u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); /* - * Any write to PKRU takes it out of the XSAVE 'init - * state' which increases context switch cost. Avoid - * writing 0 when PKRU was already 0. - */ - if (!init_pkru_value_snapshot && !read_pkru()) - return; - /* * Override the PKRU state that came from 'init_fpstate' * with the baseline from the process. 
*/ @@ -174,6 +162,7 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { + struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -196,6 +185,10 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (!pk) + return -EINVAL; + pk->pkru = new_init_pkru; return count; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 139b28a01ce4..9c2463bc158f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> @@ -115,7 +116,8 @@ void __init pti_check_boottime_disable(void) } } - if (cmdline_find_option_bool(boot_command_line, "nopti")) { + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; @@ -626,7 +628,7 @@ static void pti_set_kernel_image_nonglobal(void) */ void __init pti_init(void) { - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 487b8474c01c..7f61431c75fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) { const struct flush_tlb_info *f = info; @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, */ unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); +#endif + +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned int stride_shift, bool freed_tables, + u64 new_tlb_gen) +{ + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM + /* + * Ensure that the following code is non-reentrant and flush_tlb_info + * is not overwritten. This means no TLB flushing is initiated by + * interrupt handlers and machine-check exception handlers. + */ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); +#endif + + info->start = start; + info->end = end; + info->mm = mm; + info->stride_shift = stride_shift; + info->freed_tables = freed_tables; + info->new_tlb_gen = new_tlb_gen; + + return info; +} + +static inline void put_flush_tlb_info(void) +{ +#ifdef CONFIG_DEBUG_VM + /* Complete reentrency prevention checks */ + barrier(); + this_cpu_dec(flush_tlb_info_idx); +#endif +} + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables) { + struct flush_tlb_info *info; + u64 new_tlb_gen; int cpu; - struct flush_tlb_info info = { - .mm = mm, - .stride_shift = stride_shift, - .freed_tables = freed_tables, - }; - cpu = get_cpu(); - /* This is also a barrier that synchronizes with switch_mm(). 
*/ - info.new_tlb_gen = inc_mm_tlb_gen(mm); - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && - ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { - info.start = start; - info.end = end; - } else { - info.start = 0UL; - info.end = TLB_FLUSH_ALL; + if ((end == TLB_FLUSH_ALL) || + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; } + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); + + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, + new_tlb_gen); + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), &info); + flush_tlb_others(mm_cpumask(mm), info); + put_flush_tlb_info(); put_cpu(); } @@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { - struct flush_tlb_info info; - info.start = start; - info.end = end; - on_each_cpu(do_kernel_range_flush, &info, 1); + struct flush_tlb_info *info; + + preempt_disable(); + info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + + on_each_cpu(do_kernel_range_flush, info, 1); + + put_flush_tlb_info(); + preempt_enable(); } } +/* + * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. + * This means that the 'struct flush_tlb_info' that describes which mappings to + * flush is actually fixed. We therefore set a single fixed struct and use it in + * arch_tlbbatch_flush(). + */ +static const struct flush_tlb_info full_flush_tlb_info = { + .mm = NULL, + .start = 0, + .end = TLB_FLUSH_ALL, +}; + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - struct flush_tlb_info info = { - .mm = NULL, - .start = 0UL, - .end = TLB_FLUSH_ALL, - }; - int cpu = get_cpu(); if (cpumask_test_cpu(cpu, &batch->cpumask)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &info); + flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); cpumask_clear(&batch->cpumask); |
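The tlb.c hunks above replace the on-stack struct flush_tlb_info with a preallocated per-CPU instance handed out by get_flush_tlb_info() and released by put_flush_tlb_info(), with a CONFIG_DEBUG_VM counter catching reentrant use. The following standalone C sketch models that pattern; a single static instance and a plain counter stand in for the per-CPU data and the this_cpu_*() accessors, and the flush itself is elided, so this is illustrative only, not kernel code.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct flush_tlb_info {
	void		*mm;
	unsigned long	start;
	unsigned long	end;
	unsigned long	new_tlb_gen;
	unsigned int	stride_shift;
	bool		freed_tables;
};

static struct flush_tlb_info flush_tlb_info;	/* per-CPU in the real code */
static unsigned int flush_tlb_info_idx;		/* the CONFIG_DEBUG_VM counter */

static struct flush_tlb_info *get_flush_tlb_info(void *mm,
		unsigned long start, unsigned long end,
		unsigned int stride_shift, bool freed_tables,
		unsigned long new_tlb_gen)
{
	/* Must not nest: no flush may be started from this context again. */
	flush_tlb_info_idx++;
	assert(flush_tlb_info_idx == 1);

	flush_tlb_info = (struct flush_tlb_info){
		.mm = mm, .start = start, .end = end,
		.stride_shift = stride_shift,
		.freed_tables = freed_tables,
		.new_tlb_gen = new_tlb_gen,
	};
	return &flush_tlb_info;
}

static void put_flush_tlb_info(void)
{
	flush_tlb_info_idx--;
}

int main(void)
{
	struct flush_tlb_info *info;

	info = get_flush_tlb_info(NULL, 0x1000, 0x5000, 12, false, 1);
	/* ...hand info to the local and remote flush paths... */
	(void)info;
	put_flush_tlb_info();
	return 0;
}

Keeping the structure off the stack lets the local flush path and flush_tlb_others() share one cache-hot object, at the cost of forbidding nested flushes, which the counter enforces.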
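The fault.c hunk replaces the err_str_append() string building with a decoded, multi-line #PF report. A userspace approximation of that decoding is sketched below; the X86_PF_* values match the architectural page-fault error-code bits, while user_mode(regs) is reduced to a bool, PAGE_SIZE is hard-coded to 4096, and %px becomes %#lx.

#include <stdbool.h>
#include <stdio.h>

#define X86_PF_PROT	(1UL << 0)	/* 0: no page found, 1: protection fault */
#define X86_PF_WRITE	(1UL << 1)	/* write access */
#define X86_PF_USER	(1UL << 2)	/* fault originated in user mode */
#define X86_PF_RSVD	(1UL << 3)	/* reserved bit set in a paging entry */
#define X86_PF_INSTR	(1UL << 4)	/* instruction fetch */
#define X86_PF_PK	(1UL << 5)	/* protection keys violation */

static void show_fault_oops(unsigned long error_code, unsigned long address,
			    bool user_mode)
{
	if (address < 4096 && !user_mode)
		printf("BUG: kernel NULL pointer dereference, address: %#lx\n",
		       address);
	else
		printf("BUG: unable to handle page fault for address: %#lx\n",
		       address);

	printf("#PF: %s %s in %s mode\n",
	       (error_code & X86_PF_USER) ? "user" : "supervisor",
	       (error_code & X86_PF_INSTR) ? "instruction fetch" :
	       (error_code & X86_PF_WRITE) ? "write access" :
					     "read access",
	       user_mode ? "user" : "kernel");
	printf("#PF: error_code(0x%04lx) - %s\n", error_code,
	       !(error_code & X86_PF_PROT) ? "not-present page" :
	       (error_code & X86_PF_RSVD) ? "reserved bit violation" :
	       (error_code & X86_PF_PK) ? "protection keys violation" :
					  "permissions violation");
}

int main(void)
{
	/* Kernel write to an unmapped (not-present) address. */
	show_fault_oops(X86_PF_WRITE, 0xffffffffc0001000UL, false);
	/* Kernel NULL pointer dereference. */
	show_fault_oops(X86_PF_WRITE, 0x0, false);
	return 0;
}

Running it prints the same style of report the patched show_fault_oops() emits, with the fault kind, privilege level and error-code interpretation on separate lines.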
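poking_init() in the init.c hunk randomizes poking_addr but keeps the two text-poking pages within one PMD, nudging the address forward by a page when the second page would otherwise start a new PMD. Below is a small arithmetic sketch of that adjustment, assuming 4 KiB pages and 2 MiB PMDs; the candidate addresses are made up.

#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define PMD_SIZE	0x200000UL
#define PMD_MASK	(~(PMD_SIZE - 1))

static unsigned long fixup_poking_addr(unsigned long poking_addr)
{
	/*
	 * If poking_addr + PAGE_SIZE is PMD-aligned, the second page would
	 * fall into the next PMD; move both pages into that PMD instead.
	 */
	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
		poking_addr += PAGE_SIZE;
	return poking_addr;
}

int main(void)
{
	/* The second page at 0x400000 would start a new PMD: bump the address. */
	unsigned long addr = 0x400000UL - PAGE_SIZE;

	printf("candidate %#lx -> poking_addr %#lx\n",
	       addr, fixup_poking_addr(addr));
	/* A candidate in the middle of a PMD is left alone. */
	printf("candidate %#lx -> poking_addr %#lx\n",
	       0x345000UL, fixup_poking_addr(0x345000UL));
	return 0;
}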
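The init_64.c hunks thread an 'init' flag through the physical-mapping helpers via the DEFINE_POPULATE and DEFINE_ENTRY macros, so kernel_physical_mapping_init() keeps the set_*_safe() overwrite checks while the new kernel_physical_mapping_change() may update live entries and leaves TLB flushing to the caller, as the mem_encrypt.c hunk now relies on. Below is a userspace sketch of the DEFINE_ENTRY idea, simplified to a single type argument and with the _safe check modelled by assert(); it is not the kernel macro itself.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned long val; } pte_t;

/* Plain setter: overwrites unconditionally. */
static void set_pte(pte_t *ptep, pte_t pte) { ptep->val = pte.val; }

/* "_safe" setter: refuses to overwrite an already-present entry. */
static void set_pte_safe(pte_t *ptep, pte_t pte)
{
	assert(ptep->val == 0 && "overwriting a present entry");
	ptep->val = pte.val;
}

#define DEFINE_ENTRY(type)						\
static inline void set_##type##_init(type##_t *arg1,			\
				     type##_t arg2, bool init)		\
{									\
	if (init)							\
		set_##type##_safe(arg1, arg2);				\
	else								\
		set_##type(arg1, arg2);					\
}

DEFINE_ENTRY(pte)

int main(void)
{
	pte_t entry = { 0 };

	set_pte_init(&entry, (pte_t){ 0x1000 | 1 }, true);   /* first population: checked */
	set_pte_init(&entry, (pte_t){ 0x2000 | 1 }, false);  /* later change: unchecked */
	printf("final pte: %#lx\n", entry.val);
	return 0;
}

The same if (init) dispatch is what DEFINE_POPULATE generates for the pgd/p4d/pud/pmd populate helpers, which is how one __kernel_physical_mapping_init() body serves both the boot-time and the change paths.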