Diffstat (limited to 'arch/x86/mm')
 arch/x86/mm/amdtopology.c |  22
 arch/x86/mm/extable.c     |   2
 arch/x86/mm/fault.c       |  43
 arch/x86/mm/ident_map.c   |  19
 arch/x86/mm/init.c        |  18
 arch/x86/mm/kaslr.c       |  30
 arch/x86/mm/numa.c        |  27
 arch/x86/mm/pageattr.c    |  21
 arch/x86/mm/pat.c         |  17
 arch/x86/mm/pat_rbtree.c  |   4
 arch/x86/mm/pkeys.c       | 142
 arch/x86/mm/tlb.c         |  15
 12 files changed, 284 insertions(+), 76 deletions(-)
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index ba47524f56e8..d1c7de095808 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -52,21 +52,6 @@ static __init int find_northbridge(void)
         return -ENOENT;
 }
 
-static __init void early_get_boot_cpu_id(void)
-{
-        /*
-         * need to get the APIC ID of the BSP so can use that to
-         * create apicid_to_node in amd_scan_nodes()
-         */
-#ifdef CONFIG_X86_MPPARSE
-        /*
-         * get boot-time SMP configuration:
-         */
-        if (smp_found_config)
-                early_get_smp_config();
-#endif
-}
-
 int __init amd_numa_init(void)
 {
         u64 start = PFN_PHYS(0);
@@ -180,8 +165,11 @@ int __init amd_numa_init(void)
         cores = 1 << bits;
         apicid_base = 0;
 
-        /* get the APIC ID of the BSP early for systems with apicid lifting */
-        early_get_boot_cpu_id();
+        /*
+         * get boot-time SMP configuration:
+         */
+        early_get_smp_config();
+
         if (boot_cpu_physical_apicid > 0) {
                 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
                 apicid_base = boot_cpu_physical_apicid;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 832b98f822be..79ae939970d3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <asm/uaccess.h>
 #include <asm/traps.h>
 #include <asm/kdebug.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index dc8023060456..9f72ca3b2669 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -5,7 +5,7 @@
  */
 #include <linux/sched.h>           /* test_thread_flag(), ...     */
 #include <linux/kdebug.h>          /* oops_begin/end, ...         */
-#include <linux/module.h>          /* search_exception_table      */
+#include <linux/extable.h>         /* search_exception_tables     */
 #include <linux/bootmem.h>         /* max_low_pfn                 */
 #include <linux/kprobes.h>         /* NOKPROBE_SYMBOL, ...        */
 #include <linux/mmiotrace.h>       /* kmmio_handler, ...          */
@@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,
                 return;
         }
 
+#ifdef CONFIG_VMAP_STACK
+        /*
+         * Stack overflow? During boot, we can fault near the initial
+         * stack in the direct map, but that's not an overflow -- check
+         * that we're in vmalloc space to avoid this.
+         */
+        if (is_vmalloc_addr((void *)address) &&
+            (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+             address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+                register void *__sp asm("rsp");
+                unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+                /*
+                 * We're likely to be running with very little stack space
+                 * left. It's plausible that we'd hit this condition but
+                 * double-fault even before we get this far, in which case
+                 * we're fine: the double-fault handler will deal with it.
+                 *
+                 * We don't want to make it all the way into the oops code
+                 * and then double-fault, though, because we're likely to
+                 * break the console driver and lose most of the stack dump.
+                 */
+                asm volatile ("movq %[stack], %%rsp\n\t"
+                              "call handle_stack_overflow\n\t"
+                              "1: jmp 1b"
+                              : "+r" (__sp)
+                              : "D" ("kernel stack overflow (page fault)"),
+                                "S" (regs), "d" (address),
+                                [stack] "rm" (stack));
+                unreachable();
+        }
+#endif
+
         /*
          * 32-bit:
          *
@@ -1112,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 {
         /* This is only called for the current mm, so: */
         bool foreign = false;
+
+        /*
+         * Read or write was blocked by protection keys. This is
+         * always an unconditional error and can never result in
+         * a follow-up action to resolve the fault, like a COW.
+         */
+        if (error_code & PF_PK)
+                return 1;
+
         /*
          * Make sure to check the VMA so that we do not perform
          * faults just to hit a PF_PK as soon as we fill in a
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ec21796ac5fd..4473cb4f8b90 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -3,15 +3,17 @@
  * included by both the compressed kernel and the regular kernel.
  */
 
-static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
                            unsigned long addr, unsigned long end)
 {
         addr &= PMD_MASK;
         for (; addr < end; addr += PMD_SIZE) {
                 pmd_t *pmd = pmd_page + pmd_index(addr);
 
-                if (!pmd_present(*pmd))
-                        set_pmd(pmd, __pmd(addr | pmd_flag));
+                if (pmd_present(*pmd))
+                        continue;
+
+                set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag));
         }
 }
 
@@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 
                 if (pud_present(*pud)) {
                         pmd = pmd_offset(pud, 0);
-                        ident_pmd_init(info->pmd_flag, pmd, addr, next);
+                        ident_pmd_init(info, pmd, addr, next);
                         continue;
                 }
 
                 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
                 if (!pmd)
                         return -ENOMEM;
-                ident_pmd_init(info->pmd_flag, pmd, addr, next);
+                ident_pmd_init(info, pmd, addr, next);
                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
         }
@@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 }
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
-                              unsigned long addr, unsigned long end)
+                              unsigned long pstart, unsigned long pend)
 {
+        unsigned long addr = pstart + info->offset;
+        unsigned long end = pend + info->offset;
         unsigned long next;
         int result;
-        int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
 
         for (; addr < end; addr = next) {
-                pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+                pgd_t *pgd = pgd_page + pgd_index(addr);
                 pud_t *pud;
 
                 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 620928903be3..22af912d66d2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
         return __va(pfn << PAGE_SHIFT);
 }
 
-/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE       (6 * PAGE_SIZE)
+/*
+ * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT      6
+#else
+#define INIT_PGD_PAGE_COUNT      12
+#endif
+#define INIT_PGT_BUF_SIZE       (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
 void  __init early_alloc_pgt_buf(void)
 {
@@ -689,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
         }
 }
 
-void free_initmem(void)
+void __ref free_initmem(void)
 {
+        e820_reallocate_tables();
+
         free_init_pages("unused kernel",
                         (unsigned long)(&__init_begin),
                         (unsigned long)(&__init_end));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 26dccd6c0df1..ddd2661c4502 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,17 +40,26 @@
  * You need to add an if/def entry if you introduce a new memory region
  * compatible with KASLR. Your entry must be in logical order with memory
  * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
  * ensure that this order is correct and won't be changed.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-static const unsigned long vaddr_end = VMEMMAP_START;
+
+#if defined(CONFIG_X86_ESPFIX64)
+static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
+#elif defined(CONFIG_EFI)
+static const unsigned long vaddr_end = EFI_VA_START;
+#else
+static const unsigned long vaddr_end = __START_KERNEL_map;
+#endif
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
 EXPORT_SYMBOL(page_offset_base);
 unsigned long vmalloc_base = __VMALLOC_BASE;
 EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
 
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
@@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {
 } kaslr_regions[] = {
         { &page_offset_base, 64/* Maximum */ },
         { &vmalloc_base, VMALLOC_SIZE_TB },
+        { &vmemmap_base, 1 },
 };
 
 /* Get size in bytes used by the memory region */
@@ -77,7 +87,7 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region)
  */
 static inline bool kaslr_memory_enabled(void)
 {
-        return kaslr_enabled() && !config_enabled(CONFIG_KASAN);
+        return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
 /* Initialize base and padding for each memory region randomized with KASLR */
@@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)
         struct rnd_state rand_state;
         unsigned long remain_entropy;
 
+        /*
+         * All these BUILD_BUG_ON checks ensures the memory layout is
+         * consistent with the vaddr_start/vaddr_end variables.
+         */
+        BUILD_BUG_ON(vaddr_start >= vaddr_end);
+        BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
+                     vaddr_end >= EFI_VA_START);
+        BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
+                      config_enabled(CONFIG_EFI)) &&
+                     vaddr_end >= __START_KERNEL_map);
+        BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
         if (!kaslr_memory_enabled())
                 return;
 
@@ -97,7 +119,7 @@ void __init kernel_randomize_memory(void)
          * add padding if needed (especially for memory hotplug support).
          */
         BUG_ON(kaslr_regions[0].base != &page_offset_base);
-        memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) +
+        memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
                 CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 
         /* Adapt phyiscal memory region size based on available memory */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fb682108f4dc..3f35b48d1d9d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -722,22 +722,19 @@ void __init x86_numa_init(void)
         numa_init(dummy_numa_init);
 }
 
-static __init int find_near_online_node(int node)
+static void __init init_memory_less_node(int nid)
 {
-        int n, val;
-        int min_val = INT_MAX;
-        int best_node = -1;
+        unsigned long zones_size[MAX_NR_ZONES] = {0};
+        unsigned long zholes_size[MAX_NR_ZONES] = {0};
 
-        for_each_online_node(n) {
-                val = node_distance(node, n);
+        /* Allocate and initialize node data. Memory-less node is now online.*/
+        alloc_node_data(nid);
+        free_area_init_node(nid, zones_size, 0, zholes_size);
 
-                if (val < min_val) {
-                        min_val = val;
-                        best_node = n;
-                }
-        }
-
-        return best_node;
+        /*
+         * All zonelists will be built later in start_kernel() after per cpu
+         * areas are initialized.
+         */
 }
 
 /*
@@ -766,8 +763,10 @@ void __init init_cpu_to_node(void)
 
                 if (node == NUMA_NO_NODE)
                         continue;
+
                 if (!node_online(node))
-                        node = find_near_online_node(node);
+                        init_memory_less_node(node);
+
                 numa_set_node(cpu, node);
         }
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 849dc09fa4f0..e3353c97d086 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa,
         }
 }
 
-static int populate_pmd(struct cpa_data *cpa,
-                        unsigned long start, unsigned long end,
-                        unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+static long populate_pmd(struct cpa_data *cpa,
+                         unsigned long start, unsigned long end,
+                         unsigned num_pages, pud_t *pud, pgprot_t pgprot)
 {
-        unsigned int cur_pages = 0;
+        long cur_pages = 0;
         pmd_t *pmd;
         pgprot_t pmd_pgprot;
 
@@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa,
         return num_pages;
 }
 
-static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
-                        pgprot_t pgprot)
+static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+                         pgprot_t pgprot)
 {
         pud_t *pud;
         unsigned long end;
-        int cur_pages = 0;
+        long cur_pages = 0;
         pgprot_t pud_pgprot;
 
         end = start + (cpa->numpages << PAGE_SHIFT);
@@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 
         /* Map trailing leftover */
         if (start < end) {
-                int tmp;
+                long tmp;
 
                 pud = pud_offset(pgd, start);
                 if (pud_none(*pud))
@@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
         pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
         pud_t *pud = NULL;      /* shut up gcc */
         pgd_t *pgd_entry;
-        int ret;
+        long ret;
 
         pgd_entry = cpa->pgd + pgd_index(addr);
 
@@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
 
 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 {
-        int ret, numpages = cpa->numpages;
+        unsigned long numpages = cpa->numpages;
+        int ret;
 
         while (numpages) {
                 /*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ecb1b69c1651..170cc4ff057b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma)
 }
 
 /*
- * prot is passed in as a parameter for the new mapping. If the vma has a
- * linear pfn mapping for the entire range reserve the entire vma range with
- * single reserve_pfn_range call.
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+ * reserve the entire pfn + size range with single reserve_pfn_range
+ * call.
  */
 int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                     unsigned long pfn, unsigned long addr, unsigned long size)
@@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
         enum page_cache_mode pcm;
 
         /* reserve the whole chunk starting from paddr */
-        if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+        if (!vma || (addr == vma->vm_start
+                                && size == (vma->vm_end - vma->vm_start))) {
                 int ret;
 
                 ret = reserve_pfn_range(paddr, size, prot, 0);
-                if (!ret)
+                if (ret == 0 && vma)
                         vma->vm_flags |= VM_PAT;
                 return ret;
         }
@@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
         resource_size_t paddr;
         unsigned long prot;
 
-        if (!(vma->vm_flags & VM_PAT))
+        if (vma && !(vma->vm_flags & VM_PAT))
                 return;
 
         /* free the chunk starting from pfn or the whole chunk */
@@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                 size = vma->vm_end - vma->vm_start;
         }
         free_pfn_range(paddr, size);
-        vma->vm_flags &= ~VM_PAT;
+        if (vma)
+                vma->vm_flags &= ~VM_PAT;
 }
 
 /*
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index de391b7bc19a..159b52ccd600 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)
 
 struct memtype *rbt_memtype_lookup(u64 addr)
 {
-        struct memtype *data;
-        data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
-        return data;
+        return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
 }
 
 #if defined(CONFIG_DEBUG_FS)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e8c474451928..f88ce0e5efd9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -11,6 +11,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
  */
+#include <linux/debugfs.h>             /* debugfs_create_u32()         */
 #include <linux/mm_types.h>            /* mm_struct, vma, etc...       */
 #include <linux/pkeys.h>               /* PKEY_*                       */
 #include <uapi/asm-generic/mman-common.h>
@@ -21,8 +22,19 @@
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
+        bool need_to_set_mm_pkey = false;
+        int execute_only_pkey = mm->context.execute_only_pkey;
         int ret;
 
+        /* Do we need to assign a pkey for mm's execute-only maps? */
+        if (execute_only_pkey == -1) {
+                /* Go allocate one to use, which might fail */
+                execute_only_pkey = mm_pkey_alloc(mm);
+                if (execute_only_pkey < 0)
+                        return -1;
+                need_to_set_mm_pkey = true;
+        }
+
         /*
          * We do not want to go through the relatively costly
          * dance to set PKRU if we do not need to. Check it
@@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)
          * can make fpregs inactive.
          */
         preempt_disable();
-        if (fpregs_active() &&
-            !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+        if (!need_to_set_mm_pkey &&
+            fpregs_active() &&
+            !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
                 preempt_enable();
-                return PKEY_DEDICATED_EXECUTE_ONLY;
+                return execute_only_pkey;
         }
         preempt_enable();
-        ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+
+        /*
+         * Set up PKRU so that it denies access for everything
+         * other than execution.
+         */
+        ret = arch_set_user_pkey_access(current, execute_only_pkey,
                         PKEY_DISABLE_ACCESS);
         /*
          * If the PKRU-set operation failed somehow, just return
          * 0 and effectively disable execute-only support.
          */
-        if (ret)
-                return 0;
+        if (ret) {
+                mm_set_pkey_free(mm, execute_only_pkey);
+                return -1;
+        }
 
-        return PKEY_DEDICATED_EXECUTE_ONLY;
+        /* We got one, store it and use it from here on out */
+        if (need_to_set_mm_pkey)
+                mm->context.execute_only_pkey = execute_only_pkey;
+        return execute_only_pkey;
 }
 
 static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
@@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
         /* Do this check first since the vm_flags should be hot */
         if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
                 return false;
-        if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+        if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
                 return false;
 
         return true;
@@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
          */
         return vma_pkey(vma);
 }
+
+#define PKRU_AD_KEY(pkey)       (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+
+/*
+ * Make the default PKRU value (at execve() time) as restrictive
+ * as possible. This ensures that any threads clone()'d early
+ * in the process's lifetime will not accidentally get access
+ * to data which is pkey-protected later on.
+ */
+u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
+                      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
+                      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
+                      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
+                      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+
+/*
+ * Called from the FPU code when creating a fresh set of FPU
+ * registers. This is called from a very specific context where
+ * we know the FPU regstiers are safe for use and we can use PKRU
+ * directly. The fact that PKRU is only available when we are
+ * using eagerfpu mode makes this possible.
+ */
+void copy_init_pkru_to_fpregs(void)
+{
+        u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
+        /*
+         * Any write to PKRU takes it out of the XSAVE 'init
+         * state' which increases context switch cost. Avoid
+         * writing 0 when PKRU was already 0.
+         */
+        if (!init_pkru_value_snapshot && !read_pkru())
+                return;
+        /*
+         * Override the PKRU state that came from 'init_fpstate'
+         * with the baseline from the process.
+         */
+        write_pkru(init_pkru_value_snapshot);
+}
+
+static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
+                             size_t count, loff_t *ppos)
+{
+        char buf[32];
+        unsigned int len;
+
+        len = sprintf(buf, "0x%x\n", init_pkru_value);
+        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t init_pkru_write_file(struct file *file,
+                 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+        char buf[32];
+        ssize_t len;
+        u32 new_init_pkru;
+
+        len = min(count, sizeof(buf) - 1);
+        if (copy_from_user(buf, user_buf, len))
+                return -EFAULT;
+
+        /* Make the buffer a valid string that we can not overrun */
+        buf[len] = '\0';
+        if (kstrtouint(buf, 0, &new_init_pkru))
+                return -EINVAL;
+
+        /*
+         * Don't allow insane settings that will blow the system
+         * up immediately if someone attempts to disable access
+         * or writes to pkey 0.
+         */
+        if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT))
+                return -EINVAL;
+
+        WRITE_ONCE(init_pkru_value, new_init_pkru);
+        return count;
+}
+
+static const struct file_operations fops_init_pkru = {
+        .read = init_pkru_read_file,
+        .write = init_pkru_write_file,
+        .llseek = default_llseek,
+};
+
+static int __init create_init_pkru_value(void)
+{
+        debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
+                        arch_debugfs_dir, NULL, &fops_init_pkru);
+        return 0;
+}
+late_initcall(create_init_pkru_value);
+
+static __init int setup_init_pkru(char *opt)
+{
+        u32 new_init_pkru;
+
+        if (kstrtouint(opt, 0, &new_init_pkru))
+                return 1;
+
+        WRITE_ONCE(init_pkru_value, new_init_pkru);
+
+        return 1;
+}
+__setup("init_pkru=", setup_init_pkru);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4dbe65622810..a7655f6caf7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
         unsigned cpu = smp_processor_id();
 
         if (likely(prev != next)) {
+                if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                        /*
+                         * If our current stack is in vmalloc space and isn't
+                         * mapped in the new pgd, we'll double-fault. Forcibly
+                         * map it.
+                         */
+                        unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+                        pgd_t *pgd = next->pgd + stack_pgd_index;
+
+                        if (unlikely(pgd_none(*pgd)))
+                                set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+                }
+
 #ifdef CONFIG_SMP
                 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                 this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
                 cpumask_set_cpu(cpu, mm_cpumask(next));
 
                 /*