author		Linus Torvalds <torvalds@linux-foundation.org>	2017-05-02 09:54:56 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-02 09:54:56 +0300
commit		d3b5d35290d729a2518af00feca867385a1b08fa (patch)
tree		7b56c0863d59bc57f7c7dcf5d5665c56b05f1d1b /arch/x86/mm
parent		aa2a4b6569d5b10491b606a86e574dff3852597a (diff)
parent		71389703839ebe9cb426c72d5f0bd549592e583c (diff)
download	linux-d3b5d35290d729a2518af00feca867385a1b08fa.tar.xz
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main x86 MM changes in this cycle were:
- continued native kernel PCID support preparation patches to the TLB
flushing code (Andy Lutomirski)
 - various fixes related to the 32-bit compat syscall returning addresses
   above 4 GB in applications launched from 64-bit binaries - motivated
   by C/R frameworks such as Virtuozzo (Dmitry Safonov)
- continued Intel 5-level paging enablement: in particular the
conversion of x86 GUP to the generic GUP code. (Kirill A. Shutemov)
- x86/mpx ABI corner case fixes/enhancements (Joerg Roedel)
- ... plus misc updates, fixes and cleanups"
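The 5-level preparation work in this merge follows one mechanical pattern throughout arch/x86/mm: every pgd-to-pud descent gains an intermediate p4d step. A minimal sketch of the resulting walk, using the p4d helpers these patches introduce (walk_to_pte() is a hypothetical name used only for illustration; with CONFIG_PGTABLE_LEVELS=4 the p4d step is folded away at compile time):

/* Hypothetical helper showing the post-p4d walk order; not from the merge. */
static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);	/* new intermediate level */
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);	/* previously pud_offset(pgd, addr) */
	if (pud_none(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}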
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
x86/mm: Fix flush_tlb_page() on Xen
x86/mm: Make flush_tlb_mm_range() more predictable
x86/mm: Remove flush_tlb() and flush_tlb_current_task()
x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly()
x86/mm/64: Fix crash in remove_pagetable()
Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"
x86/boot/e820: Remove a redundant self assignment
x86/mm: Fix dump pagetables for 4 levels of page tables
x86/mpx, selftests: Only check bounds-vs-shadow when we keep shadow
x86/mpx: Correctly report do_mpx_bt_fault() failures to user-space
Revert "x86/mm/numa: Remove numa_nodemask_from_meminfo()"
x86/espfix: Add support for 5-level paging
x86/kasan: Extend KASAN to support 5-level paging
x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL=y
x86/paravirt: Add 5-level support to the paravirt code
x86/mm: Define virtual memory map for 5-level paging
x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
x86/boot: Detect 5-level paging support
x86/mm/numa: Remove numa_nodemask_from_meminfo()
...
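Several hunks below carry comments such as "With folded p4d, pgd_none() is always false". The reason is the generic folding header: when only four paging levels exist, the p4d type simply wraps the pgd entry, so the pgd level can never appear empty. A simplified sketch of the folding, modeled on include/asm-generic/pgtable-nop4d.h (trimmed, not the exact header text):

/* Sketch of p4d folding with CONFIG_PGTABLE_LEVELS = 4 (simplified). */
typedef struct { pgd_t pgd; } p4d_t;

#define PTRS_PER_P4D	1

static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
	return (p4d_t *)pgd;	/* the "p4d entry" is the pgd entry itself */
}

static inline int pgd_none(pgd_t pgd)
{
	return 0;	/* a folded pgd entry can never be empty */
}

This is why sanity checks such as the BUG_ON() in vmalloc_fault() and the clearing in KASAN's clear_pgds() are pushed one level down, onto the p4d, in the diffs that follow.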
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--	arch/x86/mm/dump_pagetables.c	59
-rw-r--r--	arch/x86/mm/fault.c		66
-rw-r--r--	arch/x86/mm/gup.c		33
-rw-r--r--	arch/x86/mm/hugetlbpage.c	9
-rw-r--r--	arch/x86/mm/ident_map.c		51
-rw-r--r--	arch/x86/mm/init_32.c		68
-rw-r--r--	arch/x86/mm/init_64.c		185
-rw-r--r--	arch/x86/mm/ioremap.c		3
-rw-r--r--	arch/x86/mm/kasan_init_64.c	33
-rw-r--r--	arch/x86/mm/mmap.c		125
-rw-r--r--	arch/x86/mm/mpx.c		10
-rw-r--r--	arch/x86/mm/numa.c		4
-rw-r--r--	arch/x86/mm/pageattr.c		54
-rw-r--r--	arch/x86/mm/pgtable.c		36
-rw-r--r--	arch/x86/mm/pgtable_32.c	8
-rw-r--r--	arch/x86/mm/tlb.c		33
16 files changed, 561 insertions, 216 deletions
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..bce6990b1d81 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
 #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
 
 #define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
 ({								\
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	}
 }
 
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
-							unsigned long P)
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
 {
 	int i;
 	pte_t *start;
 	pgprotval_t prot;
 
-	start = (pte_t *) pmd_page_vaddr(addr);
+	start = (pte_t *)pmd_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PTE; i++) {
 		prot = pte_flags(*start);
 		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 
 #if PTRS_PER_PMD > 1
 
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
-							unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
 	int i;
 	pmd_t *start;
 	pgprotval_t prot;
 
-	start = (pmd_t *) pud_page_vaddr(addr);
+	start = (pmd_t *)pud_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
 	return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
 }
 
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
-							unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
 	int i;
 	pud_t *start;
 	pgprotval_t prot;
 	pud_t *prev_pud = NULL;
 
-	start = (pud_t *) pgd_page_vaddr(addr);
+	start = (pud_t *)p4d_page_vaddr(addr);
 
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 }
 
 #else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+{
+	int i;
+	p4d_t *start;
+	pgprotval_t prot;
+
+	start = (p4d_t *)pgd_page_vaddr(addr);
+
+	for (i = 0; i < PTRS_PER_P4D; i++) {
+		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+		if (!p4d_none(*start)) {
+			if (p4d_large(*start) || !p4d_present(*start)) {
+				prot = p4d_flags(*start);
+				note_page(m, st, __pgprot(prot), 2);
+			} else {
+				walk_pud_level(m, st, *start,
+					       P + i * P4D_LEVEL_MULT);
+			}
+		} else
+			note_page(m, st, __pgprot(0), 2);
+
+		start++;
+	}
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
 #endif
 
 static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 				prot = pgd_flags(*start);
 				note_page(m, &st, __pgprot(prot), 1);
 			} else {
-				walk_pud_level(m, &st, *start,
+				walk_p4d_level(m, &st, *start,
 					       i * PGD_LEVEL_MULT);
 			}
 		} else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 428e31763cb9..8ad91a01cbc8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 {
 	unsigned index = pgd_index(address);
 	pgd_t *pgd_k;
+	p4d_t *p4d, *p4d_k;
 	pud_t *pud, *pud_k;
 	pmd_t *pmd, *pmd_k;
 
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 
 	/*
 	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
-	 * set_pud.
+	 * set_p4d/set_pud.
 	 */
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
+	p4d = p4d_offset(pgd, address);
+	p4d_k = p4d_offset(pgd_k, address);
+	if (!p4d_present(*p4d_k))
+		return NULL;
+
+	pud = pud_offset(p4d, address);
+	pud_k = pud_offset(p4d_k, address);
 	if (!pud_present(*pud_k))
 		return NULL;
 
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
 {
 	pgd_t *base = __va(read_cr3());
 	pgd_t *pgd = &base[pgd_index(address)];
+	p4d_t *p4d;
+	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
 	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
 		goto out;
 #endif
-	pmd = pmd_offset(pud_offset(pgd, address), address);
+	p4d = p4d_offset(pgd, address);
+	pud = pud_offset(p4d, address);
+	pmd = pmd_offset(pud, address);
 	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 
 	/*
@@ -425,6 +435,7 @@ void vmalloc_sync_all(void)
 static noinline int vmalloc_fault(unsigned long address)
 {
 	pgd_t *pgd, *pgd_ref;
+	p4d_t *p4d, *p4d_ref;
 	pud_t *pud, *pud_ref;
 	pmd_t *pmd, *pmd_ref;
 	pte_t *pte, *pte_ref;
@@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
 	if (pgd_none(*pgd)) {
 		set_pgd(pgd, *pgd_ref);
 		arch_flush_lazy_mmu_mode();
-	} else {
+	} else if (CONFIG_PGTABLE_LEVELS > 4) {
+		/*
+		 * With folded p4d, pgd_none() is always false, so the pgd may
+		 * point to an empty page table entry and pgd_page_vaddr()
+		 * will return garbage.
+		 *
+		 * We will do the correct sanity check on the p4d level.
+		 */
 		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 	}
 
+	/* With 4-level paging, copying happens on the p4d level. */
+	p4d = p4d_offset(pgd, address);
+	p4d_ref = p4d_offset(pgd_ref, address);
+	if (p4d_none(*p4d_ref))
+		return -1;
+
+	if (p4d_none(*p4d)) {
+		set_p4d(p4d, *p4d_ref);
+		arch_flush_lazy_mmu_mode();
+	} else {
+		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
+	}
+
 	/*
 	 * Below here mismatches are bugs because these lower tables
 	 * are shared:
 	 */
-	pud = pud_offset(pgd, address);
-	pud_ref = pud_offset(pgd_ref, address);
+	pud = pud_offset(p4d, address);
+	pud_ref = pud_offset(p4d_ref, address);
 	if (pud_none(*pud_ref))
 		return -1;
 
@@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address)
 {
 	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
 	pgd_t *pgd = base + pgd_index(address);
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address)
 	if (!pgd_present(*pgd))
 		goto out;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (bad_address(p4d))
+		goto bad;
+
+	printk("P4D %lx ", p4d_val(*p4d));
+	if (!p4d_present(*p4d) || p4d_large(*p4d))
+		goto out;
+
+	pud = pud_offset(p4d, address);
 	if (bad_address(pud))
 		goto bad;
 
@@ -1082,6 +1122,7 @@ static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	if (!pgd_present(*pgd))
 		return 0;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return 0;
+
+	if (p4d_large(*p4d))
+		return spurious_fault_check(error_code, (pte_t *) p4d);
+
+	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
 
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 1f3b6ef105cd..456dfdfd2249 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
 }
 
 /*
- * 'pteval' can come from a pte, pmd or pud.  We only check
+ * 'pteval' can come from a pte, pmd, pud or p4d.  We only check
  * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 3 types.
+ * same value on all 4 types.
  */
 static inline int pte_allows_gup(unsigned long pteval, int write)
 {
@@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	return 1;
 }
 
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 			int write, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
 
-	pudp = pud_offset(&pgd, addr);
+	pudp = pud_offset(&p4d, addr);
 	do {
 		pud_t pud = *pudp;
 
@@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	p4d_t *p4dp;
+
+	p4dp = p4d_offset(&pgd, addr);
+	do {
+		p4d_t p4d = *p4dp;
+
+		next = p4d_addr_end(addr, end);
+		if (p4d_none(p4d))
+			return 0;
+		BUILD_BUG_ON(p4d_large(p4d));
+		if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+			return 0;
+	} while (p4dp++, addr = next, addr != end);
+
+	return 1;
+}
+
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
 	local_irq_restore(flags);
@@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			goto slow;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
 			goto slow;
 	} while (pgdp++, addr = next, addr != end);
 	local_irq_enable();
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index c5066a260803..302f43fd9c28 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,10 +12,12 @@
 #include <linux/pagemap.h>
 #include <linux/err.h>
 #include <linux/sysctl.h>
+#include <linux/compat.h>
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
+#include <asm/elf.h>
 
 #if 0	/* This is just for testing */
 struct page *
@@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 
 	info.flags = 0;
 	info.length = len;
-	info.low_limit = current->mm->mmap_legacy_base;
-	info.high_limit = TASK_SIZE;
+	info.low_limit = get_mmap_base(1);
+	info.high_limit = in_compat_syscall() ?
+		tasksize_32bit() : tasksize_64bit();
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
@@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = get_mmap_base(0);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 4473cb4f8b90..04210a29dd60 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 	return 0;
 }
 
+static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
+			  unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+
+	for (; addr < end; addr = next) {
+		p4d_t *p4d = p4d_page + p4d_index(addr);
+		pud_t *pud;
+
+		next = (addr & P4D_MASK) + P4D_SIZE;
+		if (next > end)
+			next = end;
+
+		if (p4d_present(*p4d)) {
+			pud = pud_offset(p4d, 0);
+			ident_pud_init(info, pud, addr, next);
+			continue;
+		}
+		pud = (pud_t *)info->alloc_pgt_page(info->context);
+		if (!pud)
+			return -ENOMEM;
+		ident_pud_init(info, pud, addr, next);
+		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+	}
+
+	return 0;
+}
+
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 			      unsigned long pstart, unsigned long pend)
 {
@@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 
 	for (; addr < end; addr = next) {
 		pgd_t *pgd = pgd_page + pgd_index(addr);
-		pud_t *pud;
+		p4d_t *p4d;
 
 		next = (addr & PGDIR_MASK) + PGDIR_SIZE;
 		if (next > end)
 			next = end;
 
 		if (pgd_present(*pgd)) {
-			pud = pud_offset(pgd, 0);
-			result = ident_pud_init(info, pud, addr, next);
+			p4d = p4d_offset(pgd, 0);
+			result = ident_p4d_init(info, p4d, addr, next);
 			if (result)
 				return result;
 			continue;
 		}
 
-		pud = (pud_t *)info->alloc_pgt_page(info->context);
-		if (!pud)
+		p4d = (p4d_t *)info->alloc_pgt_page(info->context);
+		if (!p4d)
 			return -ENOMEM;
-		result = ident_pud_init(info, pud, addr, next);
+		result = ident_p4d_init(info, p4d, addr, next);
 		if (result)
 			return result;
-		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+		} else {
+			/*
+			 * With p4d folded, pgd is equal to p4d.
+			 * The pgd entry has to point to the pud page table in this case.
+			 */
+			pud_t *pud = pud_offset(p4d, 0);
+			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+		}
 	}
 
 	return 0;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 030bfed10a6c..f34d275ee201 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -56,8 +56,6 @@
 
 unsigned long highstart_pfn, highend_pfn;
 
-static noinline int do_test_wp_bit(void);
-
 bool __read_mostly __vmalloc_start_set = false;
 
 /*
@@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false;
  */
 static pmd_t * __init one_md_table_init(pgd_t *pgd)
 {
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd_table;
 
@@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 		pmd_table = (pmd_t *)alloc_low_page();
 		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
 		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-		pud = pud_offset(pgd, 0);
+		p4d = p4d_offset(pgd, 0);
+		pud = pud_offset(p4d, 0);
 		BUG_ON(pmd_table != pmd_offset(pud, 0));
 
 		return pmd_table;
 	}
 #endif
-	pud = pud_offset(pgd, 0);
+	p4d = p4d_offset(pgd, 0);
+	pud = pud_offset(p4d, 0);
 	pmd_table = pmd_offset(pud, 0);
 
 	return pmd_table;
@@ -390,8 +391,11 @@ pte_t *kmap_pte;
 
 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
 {
-	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
-			vaddr), vaddr), vaddr);
+	pgd_t *pgd = pgd_offset_k(vaddr);
+	p4d_t *p4d = p4d_offset(pgd, vaddr);
+	pud_t *pud = pud_offset(p4d, vaddr);
+	pmd_t *pmd = pmd_offset(pud, vaddr);
+	return pte_offset_kernel(pmd, vaddr);
 }
 
 static void __init kmap_init(void)
@@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
 	unsigned long vaddr;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
 
 	pgd = swapper_pg_dir + pgd_index(vaddr);
-	pud = pud_offset(pgd, vaddr);
+	p4d = p4d_offset(pgd, vaddr);
+	pud = pud_offset(p4d, vaddr);
 	pmd = pmd_offset(pud, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
 	pkmap_page_table = pte;
@@ -450,6 +456,7 @@ void __init native_pagetable_init(void)
 {
 	unsigned long pfn, va;
 	pgd_t *pgd, *base = swapper_pg_dir;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -469,7 +476,8 @@ void __init native_pagetable_init(void)
 		if (!pgd_present(*pgd))
 			break;
 
-		pud = pud_offset(pgd, va);
+		p4d = p4d_offset(pgd, va);
+		pud = pud_offset(p4d, va);
 		pmd = pmd_offset(pud, va);
 		if (!pmd_present(*pmd))
 			break;
@@ -716,22 +724,20 @@ void __init paging_init(void)
  */
 static void __init test_wp_bit(void)
 {
-	int wp_works_ok;
+	char z = 0;
 
-	printk(KERN_INFO
-  "Checking if this processor honours the WP bit even in supervisor mode...");
+	printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
 
-	/* Any page-aligned address will do, the test is non-destructive */
-	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
-	wp_works_ok = do_test_wp_bit();
-	clear_fixmap(FIX_WP_TEST);
+	__set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
 
-	if (!wp_works_ok) {
-		printk(KERN_CONT "No.\n");
-		panic("Linux doesn't support CPUs with broken WP.");
-	} else {
+	if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
+		clear_fixmap(FIX_WP_TEST);
 		printk(KERN_CONT "Ok.\n");
+		return;
 	}
+
+	printk(KERN_CONT "No.\n");
+	panic("Linux doesn't support CPUs with broken WP.");
 }
 
 void __init mem_init(void)
@@ -841,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size)
 #endif
 #endif
 
-/*
- * This function cannot be __init, since exceptions don't work in that
- * section. Put this after the callers, so that it cannot be inlined.
- */
-static noinline int do_test_wp_bit(void)
-{
-	char tmp_reg;
-	int flag;
-
-	__asm__ __volatile__(
-		"	movb %0, %1	\n"
-		"1:	movb %1, %0	\n"
-		"	xorl %2, %2	\n"
-		"2:			\n"
-		_ASM_EXTABLE(1b,2b)
-		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
-		 "=q" (tmp_reg),
-		 "=r" (flag)
-		:"2" (1)
-		:"memory");
-
-	return flag;
-}
-
 int kernel_set_to_readonly __read_mostly;
 
 void set_kernel_text_rw(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6da869810a8..745e5e183169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 	unsigned long address;
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
-		const pgd_t *pgd_ref = pgd_offset_k(address);
+		pgd_t *pgd_ref = pgd_offset_k(address);
+		const p4d_t *p4d_ref;
 		struct page *page;
 
-		if (pgd_none(*pgd_ref))
+		/*
+		 * With folded p4d, pgd_none() is always false, we need to
+		 * handle synchonization on p4d level.
+		 */
+		BUILD_BUG_ON(pgd_none(*pgd_ref));
+		p4d_ref = p4d_offset(pgd_ref, address);
+
+		if (p4d_none(*p4d_ref))
 			continue;
 
 		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			p4d_t *p4d;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			p4d = p4d_offset(pgd, address);
 			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
-			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
-				BUG_ON(pgd_page_vaddr(*pgd)
-				       != pgd_page_vaddr(*pgd_ref));
+			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+				BUG_ON(p4d_page_vaddr(*p4d)
+				       != p4d_page_vaddr(*p4d_ref));
 
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
+			if (p4d_none(*p4d))
+				set_p4d(p4d, *p4d_ref);
 
 			spin_unlock(pgt_lock);
 		}
@@ -149,16 +159,28 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
 {
 	if (pgd_none(*pgd)) {
-		pud_t *pud = (pud_t *)spp_getpage();
-		pgd_populate(&init_mm, pgd, pud);
-		if (pud != pud_offset(pgd, 0))
+		p4d_t *p4d = (p4d_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, p4d);
+		if (p4d != p4d_offset(pgd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
-			       pud, pud_offset(pgd, 0));
+			       p4d, p4d_offset(pgd, 0));
+	}
+	return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+	if (p4d_none(*p4d)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		p4d_populate(&init_mm, p4d, pud);
+		if (pud != pud_offset(p4d, 0))
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			       pud, pud_offset(p4d, 0));
 	}
-	return pud_offset(pgd, vaddr);
+	return pud_offset(p4d, vaddr);
 }
 
 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
 		pmd_t *pmd = (pmd_t *) spp_getpage();
 		pud_populate(&init_mm, pud, pmd);
 		if (pmd != pmd_offset(pud, 0))
-			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
 			       pmd, pmd_offset(pud, 0));
 	}
 	return pmd_offset(pud, vaddr);
@@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
 		pte_t *pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
 		if (pte != pte_offset_kernel(pmd, 0))
-			printk(KERN_ERR "PAGETABLE BUG #02!\n");
+			printk(KERN_ERR "PAGETABLE BUG #03!\n");
 	}
 	return pte_offset_kernel(pmd, vaddr);
 }
 
-void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pud = pud_page + pud_index(vaddr);
-	pmd = fill_pmd(pud, vaddr);
-	pte = fill_pte(pmd, vaddr);
+	pmd_t *pmd = fill_pmd(pud, vaddr);
+	pte_t *pte = fill_pte(pmd, vaddr);
 
 	set_pte(pte, new_pte);
 
@@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
 
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+	p4d_t *p4d = p4d_page + p4d_index(vaddr);
+	pud_t *pud = fill_pud(p4d, vaddr);
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud = pud_page + pud_index(vaddr);
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
-	pud_t *pud_page;
+	p4d_t *p4d_page;
 
 	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
 
@@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 		"PGD FIXMAP MISSING, it should be setup in head.S!\n");
 		return;
 	}
-	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
-	set_pte_vaddr_pud(pud_page, vaddr, pteval);
+
+	p4d_page = p4d_offset(pgd, 0);
+	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
 }
 
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 
 	pgd = pgd_offset_k(vaddr);
-	pud = fill_pud(pgd, vaddr);
+	p4d = fill_p4d(pgd, vaddr);
+	pud = fill_pud(p4d, vaddr);
 	return fill_pmd(pud, vaddr);
 }
 
@@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 					enum page_cache_mode cache)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pgprot_t prot;
@@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 		pgd = pgd_offset_k((unsigned long)__va(phys));
 		if (pgd_none(*pgd)) {
+			p4d = (p4d_t *) spp_getpage();
+			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+		if (p4d_none(*p4d)) {
 			pud = (pud_t *) spp_getpage();
-			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
 						_PAGE_USER));
 		}
-		pud = pud_offset(pgd, (unsigned long)__va(phys));
+		pud = pud_offset(p4d, (unsigned long)__va(phys));
 		if (pud_none(*pud)) {
 			pmd = (pmd_t *) spp_getpage();
 			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start,
 
 	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
 		pgd_t *pgd = pgd_offset_k(vaddr);
+		p4d_t *p4d;
 		pud_t *pud;
 
 		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
 
-		if (pgd_val(*pgd)) {
-			pud = (pud_t *)pgd_page_vaddr(*pgd);
+		BUILD_BUG_ON(pgd_none(*pgd));
+		p4d = p4d_offset(pgd, vaddr);
+		if (p4d_val(*p4d)) {
+			pud = (pud_t *)p4d_page_vaddr(*p4d);
 			paddr_last = phys_pud_init(pud, __pa(vaddr),
 						   __pa(vaddr_end),
 						   page_size_mask);
@@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
 					   page_size_mask);
 
 		spin_lock(&init_mm.page_table_lock);
-		pgd_populate(&init_mm, pgd, pud);
+		p4d_populate(&init_mm, p4d, pud);
 		spin_unlock(&init_mm.page_table_lock);
 		pgd_changed = true;
 	}
@@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
 	spin_unlock(&init_mm.page_table_lock);
 }
 
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	/* free a pud talbe */
+	free_pagetable(p4d_page(*p4d), 0);
+	spin_lock(&init_mm.page_table_lock);
+	p4d_clear(p4d);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
 static void __meminit
 remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 		 bool direct)
@@ -899,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
 			continue;
 		}
 
-		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		pmd_base = pmd_offset(pud, 0);
 		remove_pmd_table(pmd_base, addr, next, direct);
 		free_pmd_table(pmd_base, pud);
 	}
@@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
 		update_page_count(PG_LEVEL_1G, -pages);
 }
 
+static void __meminit
+remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pud_t *pud_base;
+	p4d_t *p4d;
+
+	p4d = p4d_start + p4d_index(addr);
+	for (; addr < end; addr = next, p4d++) {
+		next = p4d_addr_end(addr, end);
+
+		if (!p4d_present(*p4d))
+			continue;
+
+		BUILD_BUG_ON(p4d_large(*p4d));
+
+		pud_base = pud_offset(p4d, 0);
+		remove_pud_table(pud_base, addr, next, direct);
+		free_pud_table(pud_base, p4d);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_512G, -pages);
+}
+
 /* start and end are both virtual address. */
 static void __meminit
 remove_pagetable(unsigned long start, unsigned long end, bool direct)
@@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
 	unsigned long next;
 	unsigned long addr;
 	pgd_t *pgd;
-	pud_t *pud;
+	p4d_t *p4d;
 
 	for (addr = start; addr < end; addr = next) {
 		next = pgd_addr_end(addr, end);
@@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
 		if (!pgd_present(*pgd))
 			continue;
 
-		pud = (pud_t *)pgd_page_vaddr(*pgd);
-		remove_pud_table(pud, addr, next, direct);
+		p4d = p4d_offset(pgd, 0);
+		remove_p4d_table(p4d, addr, next, direct);
 	}
 
 	flush_tlb_all();
@@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr)
 {
 	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr)
 	if (pgd_none(*pgd))
 		return 0;
 
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return 0;
+
+	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud))
 		return 0;
 
@@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
 	unsigned long addr;
 	unsigned long next;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
 		if (!pgd)
 			return -ENOMEM;
 
-		pud = vmemmap_pud_populate(pgd, addr, node);
+		p4d = vmemmap_p4d_populate(pgd, addr, node);
+		if (!p4d)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(p4d, addr, node);
 		if (!pud)
 			return -ENOMEM;
 
@@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 	unsigned long end = (unsigned long)(start_page + size);
 	unsigned long next;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	unsigned int nr_pages;
@@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 		}
 		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
 
-		pud = pud_offset(pgd, addr);
+		p4d = p4d_offset(pgd, addr);
+		if (p4d_none(*p4d)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
+
+		pud = pud_offset(p4d, addr);
 		if (pud_none(*pud)) {
 			next = (addr + PAGE_SIZE) & PAGE_MASK;
 			continue;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c43b6b33463a..e4f7b25df18e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -426,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 	/* Don't assume we're using swapper_pg_dir at this point */
 	pgd_t *base = __va(read_cr3());
 	pgd_t *pgd = &base[pgd_index(addr)];
-	pud_t *pud = pud_offset(pgd, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);
 
 	return pmd;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index da92df32d0f1..0c7d8129bed6 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -34,8 +34,19 @@ static int __init map_range(struct range *range)
 
 static void __init clear_pgds(unsigned long start, unsigned long end)
 {
-	for (; start < end; start += PGDIR_SIZE)
-		pgd_clear(pgd_offset_k(start));
+	pgd_t *pgd;
+
+	for (; start < end; start += PGDIR_SIZE) {
+		pgd = pgd_offset_k(start);
+		/*
+		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
+		 * instead.
+		 */
+		if (CONFIG_PGTABLE_LEVELS < 5)
+			p4d_clear(p4d_offset(pgd, start));
+		else
+			pgd_clear(pgd);
+	}
 }
 
 static void __init kasan_map_early_shadow(pgd_t *pgd)
@@ -45,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
 	unsigned long end = KASAN_SHADOW_END;
 
 	for (i = pgd_index(start); start < end; i++) {
-		pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
-				| _KERNPG_TABLE);
+		switch (CONFIG_PGTABLE_LEVELS) {
+		case 4:
+			pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
+					_KERNPG_TABLE);
+			break;
+		case 5:
+			pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
+					_KERNPG_TABLE);
+			break;
+		default:
+			BUILD_BUG();
+		}
 		start += PGDIR_SIZE;
 	}
 }
@@ -74,6 +95,7 @@ void __init kasan_early_init(void)
 	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
 	pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
 	pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+	p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
 
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		kasan_zero_pte[i] = __pte(pte_val);
@@ -84,6 +106,9 @@ void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PUD; i++)
 		kasan_zero_pud[i] = __pud(pud_val);
 
+	for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+		kasan_zero_p4d[i] = __p4d(p4d_val);
+
 	kasan_map_early_shadow(early_level4_pgt);
 	kasan_map_early_shadow(init_level4_pgt);
 }
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 7940166c799b..19ad095b41df 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -30,30 +30,44 @@
 #include <linux/limits.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/compat.h>
 #include <asm/elf.h>
 
 struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
 
-static unsigned long stack_maxrandom_size(void)
+unsigned long tasksize_32bit(void)
+{
+	return IA32_PAGE_OFFSET;
+}
+
+unsigned long tasksize_64bit(void)
+{
+	return TASK_SIZE_MAX;
+}
+
+static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
 	unsigned long max = 0;
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+		max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+		max <<= PAGE_SHIFT;
 	}
 
	return max;
 }
 
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole with possible stack randomization.
- */
-#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
-#define MAX_GAP (TASK_SIZE/6*5)
+#ifdef CONFIG_COMPAT
+# define mmap32_rnd_bits  mmap_rnd_compat_bits
+# define mmap64_rnd_bits  mmap_rnd_bits
+#else
+# define mmap32_rnd_bits  mmap_rnd_bits
+# define mmap64_rnd_bits  mmap_rnd_bits
+#endif
+
+#define SIZE_128M    (128 * 1024 * 1024UL)
 
 static int mmap_is_legacy(void)
 {
@@ -66,54 +80,91 @@ static int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-unsigned long arch_mmap_rnd(void)
+static unsigned long arch_rnd(unsigned int rndbits)
 {
-	unsigned long rnd;
-
-	if (mmap_is_ia32())
-#ifdef CONFIG_COMPAT
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-#else
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-#endif
-	else
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+	return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
+}
 
-	return rnd << PAGE_SHIFT;
+unsigned long arch_mmap_rnd(void)
+{
+	if (!(current->flags & PF_RANDOMIZE))
+		return 0;
+	return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
 }
 
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
 {
 	unsigned long gap = rlimit(RLIMIT_STACK);
+	unsigned long gap_min, gap_max;
+
+	/*
+	 * Top of mmap area (just below the process stack).
+	 * Leave an at least ~128 MB hole with possible stack randomization.
+	 */
+	gap_min = SIZE_128M + stack_maxrandom_size(task_size);
+	gap_max = (task_size / 6) * 5;
 
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
+	if (gap < gap_min)
+		gap = gap_min;
+	else if (gap > gap_max)
+		gap = gap_max;
+
+	return PAGE_ALIGN(task_size - gap - rnd);
+}
 
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+static unsigned long mmap_legacy_base(unsigned long rnd,
+				      unsigned long task_size)
+{
+	return __TASK_UNMAPPED_BASE(task_size) + rnd;
 }
 
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
  */
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+		unsigned long random_factor, unsigned long task_size)
+{
+	*legacy_base = mmap_legacy_base(random_factor, task_size);
+	if (mmap_is_legacy())
+		*base = *legacy_base;
+	else
+		*base = mmap_base(random_factor, task_size);
+}
+
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	unsigned long random_factor = 0UL;
+	if (mmap_is_legacy())
+		mm->get_unmapped_area = arch_get_unmapped_area;
+	else
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
+	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+			arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	/*
+	 * The mmap syscall mapping base decision depends solely on the
+	 * syscall type (64-bit or compat). This applies for 64bit
+	 * applications and 32bit applications. The 64bit syscall uses
+	 * mmap_base, the compat syscall uses mmap_compat_base.
+	 */
+	arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+			arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+#endif
+}
 
-	mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
+unsigned long get_mmap_base(int is_legacy)
+{
+	struct mm_struct *mm = current->mm;
 
-	if (mmap_is_legacy()) {
-		mm->mmap_base = mm->mmap_legacy_base;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	if (in_compat_syscall()) {
+		return is_legacy ? mm->mmap_compat_legacy_base
+				 : mm->mmap_compat_base;
 	}
+#endif
+	return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index cd44ae727df7..1c34b767c84c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void)
 	if (!kernel_managing_mpx_tables(current->mm))
 		return -EINVAL;
 
-	if (do_mpx_bt_fault()) {
-		force_sig(SIGSEGV, current);
-		/*
-		 * The force_sig() is essentially "handling" this
-		 * exception, so we do not pass up the error
-		 * from do_mpx_bt_fault().
-		 */
-	}
-	return 0;
+	return do_mpx_bt_fault();
 }
 
 /*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f9d99535f233..25504d5aa816 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid)
 	nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
 				      MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (!nd_pa) {
-		pr_err("Cannot find %zu bytes in node %d\n",
+		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
 		       nd_size, nid);
 		return;
 	}
@@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid)
  * numa_cleanup_meminfo - Cleanup a numa_meminfo
  * @mi: numa_meminfo to clean up
  *
- * Sanitize @mi by merging and removing unncessary memblks.  Also check for
+ * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
  * conflicts and clear unused memblks.
  *
  * RETURNS:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a57e8e02f457..56b22fa504df 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
			     unsigned int *level)
 {
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 	if (pgd_none(*pgd))
 		return NULL;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d))
+		return NULL;
+
+	*level = PG_LEVEL_512G;
+	if (p4d_large(*p4d) || !p4d_present(*p4d))
+		return (pte_t *)p4d;
+
+	pud = pud_offset(p4d, address);
 	if (pud_none(*pud))
 		return NULL;
 
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 pmd_t *lookup_pmd_address(unsigned long address)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 
 	pgd = pgd_offset_k(address);
 	if (pgd_none(*pgd))
 		return NULL;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, address);
 	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
 		return NULL;
 
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			p4d_t *p4d;
 			pud_t *pud;
 			pmd_t *pmd;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			pud = pud_offset(pgd, address);
+			p4d = p4d_offset(pgd, address);
+			pud = pud_offset(p4d, address);
 			pmd = pmd_offset(pud, address);
 			set_pte_atomic((pte_t *)pmd, pte);
 		}
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 		pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 {
-	pud_t *pud = pud_offset(pgd, start);
+	pud_t *pud = pud_offset(p4d, start);
 
 	/*
	 * Not on a GB page boundary?
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
 	return num_pages;
 }
 
-static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
-			 pgprot_t pgprot)
+static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
+			pgprot_t pgprot)
 {
 	pud_t *pud;
 	unsigned long end;
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 		cur_pages = (pre_end - start) >> PAGE_SHIFT;
 		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
 
-		pud = pud_offset(pgd, start);
+		pud = pud_offset(p4d, start);
 
 		/*
		 * Need a PMD page?
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 	if (cpa->numpages == cur_pages)
 		return cur_pages;
 
-	pud = pud_offset(pgd, start);
+	pud = pud_offset(p4d, start);
 	pud_pgprot = pgprot_4k_2_large(pgprot);
 
 	/*
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 	if (start < end) {
 		long tmp;
 
-		pud = pud_offset(pgd, start);
+		pud = pud_offset(p4d, start);
 		if (pud_none(*pud))
 			if (alloc_pmd_page(pud))
 				return -1;
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 {
 	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
 	pud_t *pud = NULL;	/* shut up gcc */
+	p4d_t *p4d;
 	pgd_t *pgd_entry;
 	long ret;
 
 	pgd_entry = cpa->pgd + pgd_index(addr);
 
+	if (pgd_none(*pgd_entry)) {
+		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		if (!p4d)
+			return -1;
+
+		set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
+	}
+
 	/*
	 * Allocate a PUD page and hand it down for mapping.
	 */
-	if (pgd_none(*pgd_entry)) {
+	p4d = p4d_offset(pgd_entry, addr);
+	if (p4d_none(*p4d)) {
 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 		if (!pud)
 			return -1;
 
-		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
 	}
 
 	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
 	pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
 
-	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+	ret = populate_pud(cpa, addr, p4d, pgprot);
 	if (ret < 0) {
 		/*
		 * Leave the PUD page in place in case some other CPU or thread
		 * already found it, but remove any useless entries we just
		 * added to it.
		 */
-		unmap_pud_range(pgd_entry, addr,
+		unmap_pud_range(p4d, addr,
 				addr + (cpa->numpages << PAGE_SHIFT));
 		return ret;
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6cbdff26bb96..508a708eb9a6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pud));
 }
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+	tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 	   references from swapper_pg_dir. */
 	if (CONFIG_PGTABLE_LEVELS == 2 ||
 	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-	    CONFIG_PGTABLE_LEVELS == 4) {
+	    CONFIG_PGTABLE_LEVELS >= 4) {
 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 				KERNEL_PGD_PTRS);
@@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 
 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
+	p4d_t *p4d;
 	pud_t *pud;
 	int i;
 
 	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
 		return;
 
-	pud = pud_offset(pgd, 0);
+	p4d = p4d_offset(pgd, 0);
+	pud = pud_offset(p4d, 0);
 
 	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 		pmd_t *pmd = pmds[i];
@@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+	return 0;
+}
+#endif
+
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index de53c52551a5..b9bd5b8b14fa 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 		BUG();
 		return;
 	}
-	pud = pud_offset(pgd, vaddr);
+	p4d = p4d_offset(pgd, vaddr);
+	if (p4d_none(*p4d)) {
+		BUG();
+		return;
+	}
+	pud = pud_offset(p4d, vaddr);
 	if (pud_none(*pud)) {
 		BUG();
 		return;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a7655f6caf7d..6e7bedf69af7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 {
 	struct flush_tlb_info info;
 
-	if (end == 0)
-		end = start + PAGE_SIZE;
 	info.flush_mm = mm;
 	info.flush_start = start;
 	info.flush_end = end;
@@ -289,23 +287,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
 }
 
-void flush_tlb_current_task(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	preempt_disable();
-
-	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-
-	/* This is an implicit full barrier that synchronizes with switch_mm. */
-	local_flush_tlb();
-
-	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
-	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
-	preempt_enable();
-}
-
 /*
  * See Documentation/x86/tlb.txt for details.  We choose 33
  * because it is large enough to cover the vast majority (at
@@ -326,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
+
+	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+	if (base_pages_to_flush > tlb_single_page_flush_ceiling)
+		base_pages_to_flush = TLB_FLUSH_ALL;
+
 	if (current->active_mm != mm) {
 		/* Synchronize with switch_mm. */
 		smp_mb();
@@ -342,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		goto out;
 	}
 
-	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
-		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-
 	/*
	 * Both branches below are implicit full barriers (MOV to CR or
	 * INVLPG) that synchronize with switch_mm.
	 */
-	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
-		base_pages_to_flush = TLB_FLUSH_ALL;
+	if (base_pages_to_flush == TLB_FLUSH_ALL) {
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		local_flush_tlb();
 	} else {
@@ -393,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
+		flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
 
 	preempt_enable();
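The last two tlb.c hunks are connected: native_flush_tlb_others() no longer rewrites end == 0 into a one-page range, so every caller must pass an explicit [start, end) range itself. That is why flush_tlb_page() now passes start + PAGE_SIZE, and it is also the Xen fix, since the paravirt flush path never implemented the end == 0 fixup. A minimal sketch of the calling convention after this merge (assuming the 4.11-era flush_tlb_others() signature; flush_one_user_page() is an illustrative name, not a kernel function):

/* Remote single-page shootdown after this merge: pass a real range. */
static void flush_one_user_page(struct vm_area_struct *vma, unsigned long start)
{
	struct mm_struct *mm = vma->vm_mm;

	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, start,
				 start + PAGE_SIZE);	/* not 0UL any more */
}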