diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 59 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 66 | ||||
-rw-r--r-- | arch/x86/mm/gup.c | 475 | ||||
-rw-r--r-- | arch/x86/mm/hugetlbpage.c | 9 | ||||
-rw-r--r-- | arch/x86/mm/ident_map.c | 51 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 71 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 183 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/kasan_init_64.c | 15 | ||||
-rw-r--r-- | arch/x86/mm/mmap.c | 125 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 25 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 54 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/pgtable_32.c | 8 |
15 files changed, 481 insertions, 669 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..0fbdcb64f9f8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..9f305be71a72 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = { #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT) #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ ({ \ @@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st, } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, - unsigned long P) +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) { int i; pte_t *start; pgprotval_t prot; - start = (pte_t *) pmd_page_vaddr(addr); + start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); @@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, - unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; pmd_t *start; pgprotval_t prot; - start = (pmd_t *) pud_page_vaddr(addr); + start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { @@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); } -static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, - unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *) pgd_page_vaddr(addr); + start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); @@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define p4d_large(a) pud_large(__pud(p4d_val(a))) +#define p4d_none(a) pud_none(__pud(p4d_val(a))) +#endif + +#if PTRS_PER_P4D > 1 + +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +{ + int i; + p4d_t *start; + pgprotval_t prot; + + start = (p4d_t *)pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_P4D; i++) { + st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); + if (!p4d_none(*start)) { + if (p4d_large(*start) || !p4d_present(*start)) { + prot = p4d_flags(*start); + note_page(m, st, __pgprot(prot), 2); + } else { + walk_pud_level(m, st, *start, + P + i * P4D_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) +#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) +#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) #endif static inline bool is_hypervisor_range(int idx) @@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); } else { - walk_pud_level(m, &st, *start, + walk_p4d_level(m, &st, *start, i * PGD_LEVEL_MULT); } } else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..8ad91a01cbc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -425,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false, so the pgd may + * point to an empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on the p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on the p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; @@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1122,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 1f3b6ef105cd..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Lockless get_user_pages_fast for x86 - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/memremap.h> - -#include <asm/mmu_context.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X86_PAE - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a pte will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - -/* - * 'pteval' can come from a pte, pmd or pud. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. - */ -static inline int pte_allows_gup(unsigned long pteval, int write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - /* Check memory protection keys permissions. */ - if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) - return 0; - - return 1; -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; - pte_t *ptep, *ptem; - - /* - * Keep the original mapped PTE value (ptem) around since we - * might increment ptep off the end of the page when finishing - * our loop iteration. - */ - ptem = ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - break; - - if (!pte_allows_gup(pte_val(pte), write)) - break; - - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - break; - } - } else if (pte_special(pte)) - break; - - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - put_dev_pagemap(pgmap); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - if (addr == end) - ret = 1; - pte_unmap(ptem); - - return ret; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON_PAGE(page != compound_head(page), page); - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - return 0; - } - SetPageReferenced(page); - pages[*nr] = page; - get_page(page); - put_dev_pagemap(pgmap); - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - return 1; -} - -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pmd_val(pmd), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); - if (pmd_devmap(pmd)) - return __gup_device_huge_pmd(pmd, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static noinline int gup_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pud_val(pud), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - if (pud_devmap(pud)) - return __gup_device_huge_pud(pud, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - -#ifdef CONFIG_X86_64 - if (end >> __VIRTUAL_MASK_SHIFT) - goto slow_irqon; -#endif - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c5066a260803..302f43fd9c28 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -12,10 +12,12 @@ #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> +#include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> +#include <asm/elf.h> #if 0 /* This is just for testing */ struct page * @@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; - info.low_limit = current->mm->mmap_legacy_base; - info.high_limit = TASK_SIZE; + info.low_limit = get_mmap_base(1); + info.high_limit = in_compat_syscall() ? + tasksize_32bit() : tasksize_64bit(); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..04210a29dd60 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e6793f..601b8e04e5c6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -56,8 +56,6 @@ unsigned long highstart_pfn, highend_pfn; -static noinline int do_test_wp_bit(void); - bool __read_mostly __vmalloc_start_set = false; /* @@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +391,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +456,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +476,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; @@ -716,20 +724,20 @@ void __init paging_init(void) */ static void __init test_wp_bit(void) { - printk(KERN_INFO - "Checking if this processor honours the WP bit even in supervisor mode..."); + char z = 0; + + printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); - /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); - boot_cpu_data.wp_works_ok = do_test_wp_bit(); - clear_fixmap(FIX_WP_TEST); + __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); - if (!boot_cpu_data.wp_works_ok) { - printk(KERN_CONT "No.\n"); - panic("Linux doesn't support CPUs with broken WP."); - } else { + if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + clear_fixmap(FIX_WP_TEST); printk(KERN_CONT "Ok.\n"); + return; } + + printk(KERN_CONT "No.\n"); + panic("Linux doesn't support CPUs with broken WP."); } void __init mem_init(void) @@ -811,8 +819,7 @@ void __init mem_init(void) BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); - if (boot_cpu_data.wp_works_ok < 0) - test_wp_bit(); + test_wp_bit(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -840,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size) #endif #endif -/* - * This function cannot be __init, since exceptions don't work in that - * section. Put this after the callers, so that it cannot be inlined. - */ -static noinline int do_test_wp_bit(void) -{ - char tmp_reg; - int flag; - - __asm__ __volatile__( - " movb %0, %1 \n" - "1: movb %1, %0 \n" - " xorl %2, %2 \n" - "2: \n" - _ASM_EXTABLE(1b,2b) - :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), - "=q" (tmp_reg), - "=r" (flag) - :"2" (1) - :"memory"); - - return flag; -} - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 15173d37f399..7bdda6f1d135 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end) unsigned long address; for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + pgd_t *pgd_ref = pgd_offset_k(address); + const p4d_t *p4d_ref; struct page *page; - if (pgd_none(*pgd_ref)) + /* + * With folded p4d, pgd_none() is always false, we need to + * handle synchonization on p4d level. + */ + BUILD_BUG_ON(pgd_none(*pgd_ref)); + p4d_ref = p4d_offset(pgd_ref, address); + + if (p4d_none(*p4d_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); + if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) + BUG_ON(p4d_page_vaddr(*p4d) + != p4d_page_vaddr(*p4d_ref)); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); + if (p4d_none(*p4d)) + set_p4d(p4d, *p4d_ref); spin_unlock(pgt_lock); } @@ -149,16 +159,28 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) + p4d_t *p4d = (p4d_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, p4d); + if (p4d != p4d_offset(pgd, 0)) printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); + p4d, p4d_offset(pgd, 0)); + } + return p4d_offset(pgd, vaddr); +} + +static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) +{ + if (p4d_none(*p4d)) { + pud_t *pud = (pud_t *)spp_getpage(); + p4d_populate(&init_mm, p4d, pud); + if (pud != pud_offset(p4d, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pud, pud_offset(p4d, 0)); } - return pud_offset(pgd, vaddr); + return pud_offset(p4d, vaddr); } static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) @@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", pmd, pmd_offset(pud, 0)); } return pmd_offset(pud, vaddr); @@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) - printk(KERN_ERR "PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #03!\n"); } return pte_offset_kernel(pmd, vaddr); } -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pmd_t *pmd = fill_pmd(pud, vaddr); + pte_t *pte = fill_pte(pmd, vaddr); set_pte(pte, new_pte); @@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) +{ + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud = fill_pud(p4d, vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud = pud_page + pud_index(vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; - pud_t *pud_page; + p4d_t *p4d_page; pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); @@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } - pud_page = (pud_t*)pgd_page_vaddr(*pgd); - set_pte_vaddr_pud(pud_page, vaddr, pteval); + + p4d_page = p4d_offset(pgd, 0); + set_pte_vaddr_p4d(p4d_page, vaddr, pteval); } pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); + p4d = fill_p4d(pgd, vaddr); + pud = fill_pud(p4d, vaddr); return fill_pmd(pud, vaddr); } @@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, enum page_cache_mode cache) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgprot_t prot; @@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); if (pgd_none(*pgd)) { + p4d = (p4d_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | + _PAGE_USER)); + } + p4d = p4d_offset(pgd, (unsigned long)__va(phys)); + if (p4d_none(*p4d)) { pud = (pud_t *) spp_getpage(); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); } - pud = pud_offset(pgd, (unsigned long)__va(phys)); + pud = pud_offset(p4d, (unsigned long)__va(phys)); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | @@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d; pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - if (pgd_val(*pgd)) { - pud = (pud_t *)pgd_page_vaddr(*pgd); + BUILD_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vaddr); + if (p4d_val(*p4d)) { + pud = (pud_t *)p4d_page_vaddr(*p4d); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); @@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud talbe */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, -pages); } +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = (pud_t *)p4d_page_vaddr(*p4d); + remove_pud_table(pud_base, addr, next, direct); + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + /* start and end are both virtual address. */ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) @@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long next; unsigned long addr; pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) if (!pgd_present(*pgd)) continue; - pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, addr, next, direct); + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + remove_p4d_table(p4d, addr, next, direct); } flush_tlb_all(); @@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(*pgd)) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return 0; + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return 0; @@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long addr; unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; @@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, unsigned long end = (unsigned long)(start_page + size); unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; unsigned int nr_pages; @@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..a5e1cda85974 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4c90cfdc128b..3d1059db6bf6 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -33,8 +33,19 @@ static int __init map_range(struct range *range) static void __init clear_pgds(unsigned long start, unsigned long end) { - for (; start < end; start += PGDIR_SIZE) - pgd_clear(pgd_offset_k(start)); + pgd_t *pgd; + + for (; start < end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. + */ + if (CONFIG_PGTABLE_LEVELS < 5) + p4d_clear(p4d_offset(pgd, start)); + else + pgd_clear(pgd); + } } static void __init kasan_map_early_shadow(pgd_t *pgd) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 7940166c799b..19ad095b41df 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -30,30 +30,44 @@ #include <linux/limits.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> +#include <linux/compat.h> #include <asm/elf.h> struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned long stack_maxrandom_size(void) +unsigned long tasksize_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +unsigned long tasksize_64bit(void) +{ + return TASK_SIZE_MAX; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; + max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max <<= PAGE_SHIFT; } return max; } -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole with possible stack randomization. - */ -#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) -#define MAX_GAP (TASK_SIZE/6*5) +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + +#define SIZE_128M (128 * 1024 * 1024UL) static int mmap_is_legacy(void) { @@ -66,54 +80,91 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -unsigned long arch_mmap_rnd(void) +static unsigned long arch_rnd(unsigned int rndbits) { - unsigned long rnd; - - if (mmap_is_ia32()) -#ifdef CONFIG_COMPAT - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); -#else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); -#endif - else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} - return rnd << PAGE_SHIFT; +unsigned long arch_mmap_rnd(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap_min, gap_max; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_max = (task_size / 6) * 5; - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(task_size - gap - rnd); +} - return PAGE_ALIGN(TASK_SIZE - gap - rnd); +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; } /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size) +{ + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size); +} + void arch_pick_mmap_layout(struct mm_struct *mm) { - unsigned long random_factor = 0UL; + if (mmap_is_legacy()) + mm->get_unmapped_area = arch_get_unmapped_area; + else + mm->get_unmapped_area = arch_get_unmapped_area_topdown; - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. + */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), tasksize_32bit()); +#endif +} - mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; - if (mmap_is_legacy()) { - mm->mmap_base = mm->mmap_legacy_base; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; } +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 12dcad7297a5..175f54ac6772 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid) nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, MEMBLOCK_ALLOC_ACCESSIBLE); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", + pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", nd_size, nid); return; } @@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid) * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * - * Sanitize @mi by merging and removing unncessary memblks. Also check for + * Sanitize @mi by merging and removing unnecessary memblks. Also check for * conflicts and clear unused memblks. * * RETURNS: @@ -314,20 +314,6 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) return 0; } -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - /** * numa_reset_distance - Reset NUMA distance table * @@ -347,16 +333,12 @@ void __init numa_reset_distance(void) static int __init numa_alloc_distance(void) { - nodemask_t nodes_parsed; size_t size; int i, j, cnt = 0; u64 phys; /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) + for_each_node_mask(i, numa_nodes_parsed) cnt = i; cnt++; size = cnt * cnt * sizeof(numa_distance[0]); @@ -535,7 +517,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Account for nodes with cpus and no memory */ node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 28d42130243c..b5949017dead 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; @@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) return NULL; @@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } @@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { - pud_t *pud = pud_offset(pgd, start); + pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? @@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa, return num_pages; } -static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) { pud_t *pud; unsigned long end; @@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); /* * Need a PMD page? @@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (cpa->numpages == cur_pages) return cur_pages; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* @@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (start < end) { long tmp; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; @@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + /* * Allocate a PUD page and hand it down for mapping. */ - if (pgd_none(*pgd_entry)) { + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); - ret = populate_pud(cpa, addr, pgd_entry, pgprot); + ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ - unmap_pud_range(pgd_entry, addr, + unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..38b6daf72deb 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..3d275a791c76 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; |