199 files changed, 3312 insertions(+), 2163 deletions(-)
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 1c4e1825d769..8548b0b04e43 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -217,14 +217,15 @@ This allows to collect coverage from two types of kernel background
 threads: the global ones, that are spawned during kernel boot in a limited
 number of instances (e.g. one USB hub_event() worker thread is spawned per
 USB HCD); and the local ones, that are spawned when a user interacts with
-some kernel interface (e.g. vhost workers).
+some kernel interface (e.g. vhost workers); as well as from soft
+interrupts.
 
-To enable collecting coverage from a global background thread, a unique
-global handle must be assigned and passed to the corresponding
-kcov_remote_start() call. Then a userspace process can pass a list of such
-handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the
-kcov_remote_arg struct. This will attach the used kcov device to the code
-sections, that are referenced by those handles.
+To enable collecting coverage from a global background thread or from a
+softirq, a unique global handle must be assigned and passed to the
+corresponding kcov_remote_start() call. Then a userspace process can pass
+a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles
+array field of the kcov_remote_arg struct. This will attach the used kcov
+device to the code sections, that are referenced by those handles.
 
 Since there might be many local background threads spawned from different
 userspace processes, we can't use a single global handle per annotation.
@@ -242,7 +243,7 @@ handles as they don't belong to a particular subsystem. The bytes 4-7 are
 currently reserved and must be zero. In the future the number of bytes
 used for the subsystem or handle ids might be increased.
 
-When a particular userspace proccess collects coverage by via a common
+When a particular userspace process collects coverage via a common
 handle, kcov will collect coverage for each code section that is annotated
 to use the common handle obtained as kcov_handle from the current
 task_struct. However non common handles allow to collect coverage
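To make the remote-coverage flow above concrete, here is a minimal userspace sketch, condensed from the style of the examples in kcov.rst. The ioctl numbers and the struct layout follow the uapi header; the subsystem/instance ids are made up for illustration and error handling is dropped:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    #define KCOV_INIT_TRACE    _IOR('c', 1, unsigned long)
    #define KCOV_DISABLE       _IO('c', 101)
    #define KCOV_REMOTE_ENABLE _IOW('c', 102, struct kcov_remote_arg)
    #define KCOV_TRACE_PC      0
    #define COVER_SIZE         (64 << 10)   /* in words */

    struct kcov_remote_arg {
            uint32_t trace_mode;
            uint32_t area_size;
            uint32_t num_handles;
            uint64_t common_handle;
            uint64_t handles[];
    };

    int main(void)
    {
            /* Global handle: subsystem id in the top byte (0x01 is the USB
             * subsystem in the uapi header), instance id in bytes 0-3.
             * The instance id 0x42 here is hypothetical. */
            uint64_t handle = (0x01ull << 56) | 0x42;
            struct kcov_remote_arg *arg;
            unsigned long *cover;
            int fd;

            fd = open("/sys/kernel/debug/kcov", O_RDWR);
            ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
            cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

            arg = calloc(1, sizeof(*arg) + sizeof(uint64_t));
            arg->trace_mode  = KCOV_TRACE_PC;
            arg->area_size   = COVER_SIZE;
            arg->num_handles = 1;
            arg->handles[0]  = handle;
            /* Attach this kcov device to all code sections annotated
             * with kcov_remote_start(handle), including softirqs now. */
            ioctl(fd, KCOV_REMOTE_ENABLE, arg);

            /* ... trigger the background thread or softirq, read cover[] ... */
            ioctl(fd, KCOV_DISABLE, 0);
            return 0;
    }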
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
new file mode 100644
index 000000000000..c527d05c0459
--- /dev/null
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -0,0 +1,34 @@
+#
+# Feature name:          debug-vm-pgtable
+#         Kconfig:       ARCH_HAS_DEBUG_VM_PGTABLE
+#         description:   arch supports pgtable tests for semantics compliance
+#
+    -----------------------
+    |         arch |status|
+    -----------------------
+    |       alpha: | TODO |
+    |         arc: |  ok  |
+    |         arm: | TODO |
+    |       arm64: |  ok  |
+    |         c6x: | TODO |
+    |        csky: | TODO |
+    |       h8300: | TODO |
+    |     hexagon: | TODO |
+    |        ia64: | TODO |
+    |        m68k: | TODO |
+    |  microblaze: | TODO |
+    |        mips: | TODO |
+    |       nds32: | TODO |
+    |       nios2: | TODO |
+    |    openrisc: | TODO |
+    |      parisc: | TODO |
+    |     powerpc: |  ok  |
+    |       riscv: | TODO |
+    |        s390: |  ok  |
+    |          sh: | TODO |
+    |       sparc: | TODO |
+    |          um: | TODO |
+    |   unicore32: | TODO |
+    |         x86: |  ok  |
+    |      xtensa: | TODO |
+    -----------------------
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ff306246d0f8..471ef22216c4 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -6,6 +6,7 @@ config ARC
 	def_bool y
 	select ARC_TIMERS
+	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SETUP_DMA_OPS
diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h
index 1af00accb37f..6e5eafb3afdd 100644
--- a/arch/arc/include/asm/highmem.h
+++ b/arch/arc/include/asm/highmem.h
@@ -25,17 +25,8 @@
 #define PKMAP_ADDR(nr)		(PKMAP_BASE + ((nr) << PAGE_SHIFT))
 #define PKMAP_NR(virt)		(((virt) - PKMAP_BASE) >> PAGE_SHIFT)
 
-#define kmap_prot		PAGE_KERNEL
-
-
 #include <asm/cacheflush.h>
 
-extern void *kmap(struct page *page);
-extern void *kmap_high(struct page *page);
-extern void *kmap_atomic(struct page *page);
-extern void __kunmap_atomic(void *kvaddr);
-extern void kunmap_high(struct page *page);
-
 extern void kmap_init(void);
 
 static inline void flush_cache_kmaps(void)
@@ -43,15 +34,6 @@ static inline void flush_cache_kmaps(void)
 	flush_cache_all();
 }
 
-static inline void kunmap(struct page *page)
-{
-	BUG_ON(in_interrupt());
-	if (!PageHighMem(page))
-		return;
-	kunmap_high(page);
-}
-
-
 #endif
 
 #endif
diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c
index fc8849e4f72e..479b0d72d3cf 100644
--- a/arch/arc/mm/highmem.c
+++ b/arch/arc/mm/highmem.c
@@ -49,38 +49,23 @@
 extern pte_t * pkmap_page_table;
 
 static pte_t * fixmap_page_table;
 
-void *kmap(struct page *page)
-{
-	BUG_ON(in_interrupt());
-	if (!PageHighMem(page))
-		return page_address(page);
-
-	return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void *kmap_atomic(struct page *page)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
 {
 	int idx, cpu_idx;
 	unsigned long vaddr;
 
-	preempt_disable();
-	pagefault_disable();
-	if (!PageHighMem(page))
-		return page_address(page);
-
 	cpu_idx = kmap_atomic_idx_push();
 	idx = cpu_idx + KM_TYPE_NR * smp_processor_id();
 	vaddr = FIXMAP_ADDR(idx);
 
 	set_pte_at(&init_mm, vaddr, fixmap_page_table + idx,
-		   mk_pte(page, kmap_prot));
+		   mk_pte(page, prot));
 
 	return (void *)vaddr;
 }
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
 
-void __kunmap_atomic(void *kv)
+void kunmap_atomic_high(void *kv)
 {
 	unsigned long kvaddr = (unsigned long)kv;
 
@@ -102,11 +87,8 @@ void kunmap_atomic_high(void *kv)
 		kmap_atomic_idx_pop();
 	}
-
-	pagefault_enable();
-	preempt_enable();
 }
-EXPORT_SYMBOL(__kunmap_atomic);
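The highmem hunks in this series (arc above, arm and csky below) all delete the same boilerplate, because it moves into one shared implementation in include/linux/highmem.h. A sketch of what that consolidated core looks like, reconstructed from what each arch hunk removes; treat the exact shape as an assumption of this note, not a quotation of the patch:

    /* Arch hook: flush after kmap_high(); the default is a no-op,
     * csky overrides it via ARCH_HAS_KMAP_FLUSH_TLB (see its hunk below). */
    #ifndef ARCH_HAS_KMAP_FLUSH_TLB
    static inline void kmap_flush_tlb(unsigned long addr) { }
    #endif

    static inline void *kmap(struct page *page)
    {
            void *addr;

            might_sleep();
            if (!PageHighMem(page))
                    addr = page_address(page);  /* lowmem is always mapped */
            else
                    addr = kmap_high(page);
            kmap_flush_tlb((unsigned long)addr);
            return addr;
    }

    /* Each arch now provides only the machine-specific fixmap step: */
    void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
    void kunmap_atomic_high(void *kvaddr);

    static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
    {
            preempt_disable();          /* common prologue, written once */
            pagefault_disable();
            if (!PageHighMem(page))
                    return page_address(page);
            return kmap_atomic_high_prot(page, prot);
    }
    /* kmap_prot now defaults to PAGE_KERNEL generically, which is why
     * the per-arch #define kmap_prot lines are deleted. */
    #define kmap_atomic(page)   kmap_atomic_prot(page, kmap_prot)

This is why every converted arch deletes its PageHighMem() check, its preempt/pagefault handling, and its kmap()/kunmap() copies: they were identical everywhere.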
+EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index eb4e4207cd3c..31811be38d78 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -10,8 +10,6 @@ #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL - #define flush_cache_kmaps() \ do { \ if (cache_is_vivt()) \ @@ -20,9 +18,6 @@ extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - /* * The reason for kmap_high_get() is to ensure that the currently kmap'd * page usage count does not decrease to zero while we're using its @@ -63,10 +58,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index befc8fcec98f..fba20607c53c 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -17,7 +17,6 @@ #else -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c9450982a155..d72b14c96670 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -24,6 +24,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; + p4d_t *p4d; pmd_t *pmd; pte_t *pte; pud_t *pud; @@ -33,7 +34,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d))) + return 0; + + pud = pud_offset(p4d, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) return 0; diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index d96a101e5504..0631a7b02678 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -633,7 +633,7 @@ static void __init map_sa1100_gpio_regs( void ) int prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_DOMAIN(DOMAIN_IO); pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt); *pmd = __pmd(phys | prot); flush_pmd_entry(pmd); } diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 7d6291f23251..677549d6854c 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -207,6 +207,7 @@ struct pg_level { static struct pg_level pg_level[] = { { }, { /* pgd */ + }, { /* p4d */ }, { /* pud */ }, { /* pmd */ .bits = section_bits, @@ -308,7 +309,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start, for (i = 0; i < PTRS_PER_PTE; i++, pte++) { addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), domain); + note_page(st, addr, 5, pte_val(*pte), domain); } } @@ -350,14 +351,14 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) addr += SECTION_SIZE; pmd++; domain = 
get_domain_name(pmd); - note_page(st, addr, 3, pmd_val(*pmd), domain); + note_page(st, addr, 4, pmd_val(*pmd), domain); } } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned i; @@ -366,7 +367,23 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) if (!pud_none(*pud)) { walk_pmd(st, pud, addr); } else { - note_page(st, addr, 2, pud_val(*pud), NULL); + note_page(st, addr, 3, pud_val(*pud), NULL); + } + } +} + +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) { + walk_pud(st, p4d, addr); + } else { + note_page(st, addr, 2, p4d_val(*p4d), NULL); } } } @@ -381,7 +398,7 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { addr = start + i * PGDIR_SIZE; if (!pgd_none(*pgd)) { - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } else { note_page(st, addr, 1, pgd_val(*pgd), NULL); } diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index ae857f41f68d..489aaafa6ebd 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -91,6 +91,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, { spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -100,7 +101,11 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pgd_none_or_clear_bad(pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none_or_clear_bad(pud)) return 0; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2dd5c41cbb8d..ff230e9affc4 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -43,19 +43,21 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) break; - if (pgd_bad(*pgd)) { + if (p4d_bad(*p4d)) { pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%08llx", (long long)pud_val(*pud)); @@ -405,6 +407,7 @@ do_translation_fault(unsigned long addr, unsigned int fsr, { unsigned int index; pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -419,13 +422,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, pgd = cpu_get_pgd() + index; pgd_k = init_mm.pgd + index; - if (pgd_none(*pgd_k)) + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + + if (p4d_none(*p4d_k)) goto bad_area; - if (!pgd_present(*pgd)) - set_pgd(pgd, *pgd_k); + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); - pud = pud_offset(pgd, addr); - pud_k = pud_offset(pgd_k, addr); + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); if (pud_none(*pud_k)) goto bad_area; diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index a76f8ace9ce6..e013f6b81328 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,36 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long 
vaddr) return *ptep; } -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -90,13 +67,13 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. */ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx, type; @@ -118,10 +95,8 @@ void __kunmap_atomic(void *kvaddr) /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void *kmap_atomic_pfn(unsigned long pfn) { diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index a033f6134a64..cd54411ef1b8 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -68,7 +68,8 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, static void idmap_add_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long prot) { - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 4e43455fab84..01e18e43b174 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -519,7 +519,7 @@ static inline void section_update(unsigned long addr, pmdval_t mask, { pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, addr), addr), addr), addr); #ifdef CONFIG_ARM_LPAE pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot); diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 72286f9a4d30..75529d76d28c 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -142,12 +142,14 @@ static void unmap_area_sections(unsigned long virt, unsigned long size) { unsigned long addr = virt, end = virt + (size & ~(SZ_1M - 1)); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmdp; flush_cache_vunmap(addr, end); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmdp = pmd_offset(pud, addr); do { pmd_t pmd = *pmdp; @@ -190,6 +192,7 @@ remap_area_sections(unsigned long virt, unsigned long pfn, { unsigned long addr = virt, end = virt + size; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -200,7 +203,8 @@ remap_area_sections(unsigned long virt, unsigned long pfn, unmap_area_sections(virt, size); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmd = pmd_offset(pud, addr); do { pmd[0] = __pmd(__pfn_to_phys(pfn) | type->prot_sect); @@ -222,6 +226,7 @@ 
remap_area_supersections(unsigned long virt, unsigned long pfn,
 {
 	unsigned long addr = virt, end = virt + size;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -232,7 +237,8 @@ remap_area_supersections(unsigned long virt, unsigned long pfn,
 	unmap_area_sections(virt, size);
 
 	pgd = pgd_offset_k(virt);
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	pmd = pmd_offset(pud, addr);
 	do {
 		unsigned long super_pmd_val, i;
diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h
index 88c121ac14b3..4f1f72b75890 100644
--- a/arch/arm/mm/mm.h
+++ b/arch/arm/mm/mm.h
@@ -38,7 +38,7 @@ static inline pte_t get_top_pte(unsigned long va)
 
 static inline pmd_t *pmd_off_k(unsigned long virt)
 {
-	return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
+	return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt);
 }
 
 struct mem_type {
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index ec8d0008bfa1..c425288f1a86 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -357,7 +357,8 @@ static pte_t *pte_offset_late_fixmap(pmd_t *dir, unsigned long addr)
 static inline pmd_t * __init fixmap_pmd(unsigned long addr)
 {
 	pgd_t *pgd = pgd_offset_k(addr);
-	pud_t *pud = pud_offset(pgd, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);
 
 	return pmd;
@@ -801,12 +802,12 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
 	} while (pmd++, addr = next, addr != end);
 }
 
-static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
+static void __init alloc_init_pud(p4d_t *p4d, unsigned long addr,
 				  unsigned long end, phys_addr_t phys,
 				  const struct mem_type *type,
 				  void *(*alloc)(unsigned long sz), bool ng)
 {
-	pud_t *pud = pud_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	unsigned long next;
 
 	do {
@@ -816,6 +817,21 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
 	} while (pud++, addr = next, addr != end);
 }
 
+static void __init alloc_init_p4d(pgd_t *pgd, unsigned long addr,
+				  unsigned long end, phys_addr_t phys,
+				  const struct mem_type *type,
+				  void *(*alloc)(unsigned long sz), bool ng)
+{
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	unsigned long next;
+
+	do {
+		next = p4d_addr_end(addr, end);
+		alloc_init_pud(p4d, addr, next, phys, type, alloc, ng);
+		phys += next - addr;
+	} while (p4d++, addr = next, addr != end);
+}
+
 #ifndef CONFIG_ARM_LPAE
 static void __init create_36bit_mapping(struct mm_struct *mm,
 					struct map_desc *md,
@@ -863,7 +879,8 @@ static void __init create_36bit_mapping(struct mm_struct *mm,
 	pgd = pgd_offset(mm, addr);
 	end = addr + length;
 	do {
-		pud_t *pud = pud_offset(pgd, addr);
+		p4d_t *p4d = p4d_offset(pgd, addr);
+		pud_t *pud = pud_offset(p4d, addr);
 		pmd_t *pmd = pmd_offset(pud, addr);
 		int i;
 
@@ -914,7 +931,7 @@ static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
 	do {
 		unsigned long next = pgd_addr_end(addr, end);
 
-		alloc_init_pud(pgd, addr, next, phys, type, alloc, ng);
+		alloc_init_p4d(pgd, addr, next, phys, type, alloc, ng);
 
 		phys += next - addr;
 		addr = next;
@@ -950,7 +967,13 @@ void __init create_mapping_late(struct mm_struct *mm, struct map_desc *md,
 				bool ng)
 {
 #ifdef CONFIG_ARM_LPAE
-	pud_t *pud = pud_alloc(mm, pgd_offset(mm, md->virtual), md->virtual);
+	p4d_t *p4d;
+	pud_t *pud;
+
+	p4d = p4d_alloc(mm, pgd_offset(mm, md->virtual), md->virtual);
+	if (WARN_ON(!p4d))
+		return;
+	pud = pud_alloc(mm, p4d, md->virtual);
 	if (WARN_ON(!pud))
 		return;
 	pmd_alloc(mm, pud, 0);
diff --git a/arch/arm/mm/pgd.c
b/arch/arm/mm/pgd.c index 478bd2c6aa50..c5e1b27046a8 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -30,6 +30,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd, *init_pgd; + p4d_t *new_p4d, *init_p4d; pud_t *new_pud, *init_pud; pmd_t *new_pmd, *init_pmd; pte_t *new_pte, *init_pte; @@ -53,8 +54,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) /* * Allocate PMD table for modules and pkmap mappings. */ - new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), + new_p4d = p4d_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), MODULES_VADDR); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, MODULES_VADDR); if (!new_pud) goto no_pud; @@ -69,7 +74,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * contains the machine vectors. The vectors are always high * with LPAE. */ - new_pud = pud_alloc(mm, new_pgd, 0); + new_p4d = p4d_alloc(mm, new_pgd, 0); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, 0); if (!new_pud) goto no_pud; @@ -91,7 +100,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_val(*new_pmd) |= PMD_DOMAIN(DOMAIN_VECTORS); #endif - init_pud = pud_offset(init_pgd, 0); + init_p4d = p4d_offset(init_pgd, 0); + init_pud = pud_offset(init_p4d, 0); init_pmd = pmd_offset(init_pud, 0); init_pte = pte_offset_map(init_pmd, 0); set_pte_ext(new_pte + 0, init_pte[0], 0); @@ -108,6 +118,8 @@ no_pte: no_pmd: pud_free(mm, new_pud); no_pud: + p4d_free(mm, new_p4d); +no_p4d: __pgd_free(new_pgd); no_pgd: return NULL; @@ -116,6 +128,7 @@ no_pgd: void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgtable_t pte; @@ -127,7 +140,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) if (pgd_none_or_clear_bad(pgd)) goto no_pgd; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + goto no_p4d; + + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) goto no_pud; @@ -144,8 +161,11 @@ no_pmd: pmd_free(mm, pmd); mm_dec_nr_pmds(mm); no_pud: - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); +no_p4d: + pgd_clear(pgd); + p4d_free(mm, p4d); no_pgd: #ifdef CONFIG_ARM_LPAE /* @@ -156,15 +176,21 @@ no_pgd: continue; if (pgd_val(*pgd) & L_PGD_SWAPPER) continue; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + continue; + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) continue; pmd = pmd_offset(pud, 0); pud_clear(pud); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); + mm_dec_nr_puds(mm); + pgd_clear(pgd); + p4d_free(mm, p4d); } #endif __pgd_free(pgd_base); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d0bc8bae7c8d..7f9d38444d6d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,6 +12,7 @@ config ARM64 select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 324c8483d2b9..f1a74163d764 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -172,8 +172,8 @@ void kvm_clear_hyp_idmap(void); __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) #define kvm_mk_pud(pmdp) \ __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_pgd(pudp) \ - __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_mk_p4d(pmdp) \ + 
__p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) #define kvm_set_pud(pudp, pud) set_pud(pudp, pud) @@ -299,6 +299,12 @@ static inline bool kvm_s2pud_young(pud_t pud) #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) #endif +#ifdef __PAGETABLE_P4D_FOLDED +#define hyp_p4d_table_empty(p4dp) (0) +#else +#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) +#endif + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 172d76fa0245..58e93583ddb6 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -73,17 +73,17 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pudp) free_page((unsigned long)pudp); } -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { - set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot)); + set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE); + __p4d_populate(p4dp, __pa(pudp), PUD_TYPE_TABLE); } #else -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { BUILD_BUG(); } diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index acb0751a6606..b8f158ae2527 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -14,6 +14,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; /* @@ -44,13 +45,11 @@ typedef struct { pteval_t pgprot; } pgprot_t; #define __pgprot(x) ((pgprot_t) { (x) } ) #if CONFIG_PGTABLE_LEVELS == 2 -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #elif CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> #elif CONFIG_PGTABLE_LEVELS == 4 -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> #endif #endif /* __ASM_PGTABLE_TYPES_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9ce000f22d9e..1f3218fc52fc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -298,6 +298,11 @@ static inline pte_t pgd_pte(pgd_t pgd) return __pte(pgd_val(pgd)); } +static inline pte_t p4d_pte(p4d_t p4d) +{ + return __pte(p4d_val(p4d)); +} + static inline pte_t pud_pte(pud_t pud) { return __pte(pud_val(pud)); @@ -401,6 +406,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) +#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) +#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) + #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) #define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) @@ -592,49 +600,50 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!(pgd_val(pgd) & 2)) -#define pgd_present(pgd) (pgd_val(pgd)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) +#define p4d_present(p4d) (p4d_val(p4d)) -static inline void 
set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { - if (in_swapper_pgdir(pgdp)) { - set_swapper_pgd(pgdp, pgd); + if (in_swapper_pgdir(p4dp)) { + set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); return; } - WRITE_ONCE(*pgdp, pgd); + WRITE_ONCE(*p4dp, p4d); dsb(ishst); isb(); } -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - set_pgd(pgdp, __pgd(0)); + set_p4d(p4dp, __p4d(0)); } -static inline phys_addr_t pgd_page_paddr(pgd_t pgd) +static inline phys_addr_t p4d_page_paddr(p4d_t p4d) { - return __pgd_to_phys(pgd); + return __p4d_to_phys(p4d); } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) -#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) phys_to_page(__pgd_to_phys(pgd)) +#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) #else +#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */ diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 326aac658b9d..9a364aeae5fb 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -68,41 +68,67 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm) #define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) + +static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + return p4d_offset(pgd, address); +} + +static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d) +{ +} + +static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) +{ + return false; +} + +static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, + phys_addr_t addr, phys_addr_t end) +{ + return end; +} + +static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_none(pgd); + return p4d_none(p4d); else return 0; } -static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) { if (kvm_stage2_has_pud(kvm)) - pgd_clear(pgdp); + p4d_clear(p4dp); } -static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) +static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_present(pgd); + return p4d_present(p4d); else return 1; } -static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t 
*pgd, pud_t *pud) +static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) { if (kvm_stage2_has_pud(kvm)) - pgd_populate(NULL, pgd, pud); + p4d_populate(NULL, p4d, pud); } static inline pud_t *stage2_pud_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) + p4d_t *p4d, unsigned long address) { if (kvm_stage2_has_pud(kvm)) - return pud_offset(pgd, address); + return pud_offset(p4d, address); else - return (pud_t *)pgd; + return (pud_t *)p4d; } static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 5b73e92c99e3..a8a4b55f3a09 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -184,6 +184,7 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgprot_t pgprot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -196,7 +197,15 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgd_populate(&init_mm, pgdp, pudp); } - pudp = pud_offset(pgdp, dst_addr); + p4dp = p4d_offset(pgdp, dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = (void *)get_safe_page(GFP_ATOMIC); + if (!pudp) + return -ENOMEM; + p4d_populate(&init_mm, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); if (pud_none(READ_ONCE(*pudp))) { pmdp = (void *)get_safe_page(GFP_ATOMIC); if (!pmdp) @@ -419,7 +428,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, return 0; } -static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, +static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, unsigned long end) { pud_t *dst_pudp; @@ -427,15 +436,15 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, unsigned long next; unsigned long addr = start; - if (pgd_none(READ_ONCE(*dst_pgdp))) { + if (p4d_none(READ_ONCE(*dst_p4dp))) { dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); if (!dst_pudp) return -ENOMEM; - pgd_populate(&init_mm, dst_pgdp, dst_pudp); + p4d_populate(&init_mm, dst_p4dp, dst_pudp); } - dst_pudp = pud_offset(dst_pgdp, start); + dst_pudp = pud_offset(dst_p4dp, start); - src_pudp = pud_offset(src_pgdp, start); + src_pudp = pud_offset(src_p4dp, start); do { pud_t pud = READ_ONCE(*src_pudp); @@ -454,6 +463,27 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, return 0; } +static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, unsigned long end) { @@ -466,7 +496,7 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, next = pgd_addr_end(addr, end); if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_pud(dst_pgdp, src_pgdp, addr, next)) + if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a1f6bc70c4e4..290154e32c0b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -158,13 +158,22 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache 
*mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(kvm, pud_table); + stage2_p4d_free(kvm, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr) +{ + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); + stage2_p4d_clear(kvm, p4d); + kvm_tlb_flush_vmid_ipa(kvm, addr); + stage2_pud_free(kvm, pud_table); + put_page(virt_to_page(p4d)); +} + static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); @@ -208,12 +217,20 @@ static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) dsb(ishst); } -static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) { - WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); dsb(ishst); } +static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) +{ +#ifndef __PAGETABLE_P4D_FOLDED + WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); + dsb(ishst); +#endif +} + /* * Unmapping vs dcache management: * @@ -293,13 +310,13 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, clear_stage2_pud_entry(kvm, pud, start_addr); } -static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(kvm, pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -317,6 +334,23 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); if (stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_p4d_entry(kvm, p4d, start_addr); +} + +static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + unmap_stage2_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (stage2_p4d_table_empty(kvm, start_p4d)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -351,7 +385,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) break; next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_puds(kvm, pgd, addr, next); + unmap_stage2_p4ds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock * to prevent starvation and lockup detector warnings. 
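The bulk of this series is one mechanical transformation, repeated for every page-table walker in the converted architectures (arm and arm64 above, ia64 and others below): each walk gains a p4d step between pgd and pud, and pud_offset()/pud_alloc() now take the p4d entry instead of the pgd entry. The shape of a converted walk, as a sketch for kernel context; the accessors are the kernel's real helpers, but the walker itself is illustrative and not part of this diff:

    /* Walk init_mm down to the PTE mapping a kernel virtual address. */
    static pte_t *lookup_kernel_pte(unsigned long addr)
    {
            pgd_t *pgd = pgd_offset_k(addr);
            p4d_t *p4d;
            pud_t *pud;
            pmd_t *pmd;

            if (pgd_none(*pgd) || pgd_bad(*pgd))
                    return NULL;
            p4d = p4d_offset(pgd, addr);   /* the new step */
            if (p4d_none(*p4d) || p4d_bad(*p4d))
                    return NULL;
            pud = pud_offset(p4d, addr);   /* previously took the pgd entry */
            if (pud_none(*pud) || pud_bad(*pud))
                    return NULL;
            pmd = pmd_offset(pud, addr);
            if (pmd_none(*pmd) || pmd_bad(*pmd))
                    return NULL;
            return pte_offset_kernel(pmd, addr);
    }

The same pattern explains the allocation-side hunks: p4d_alloc() slots in between pgd_offset() and pud_alloc(), exactly as in the arm pgd.c and arm64 hugetlbpage.c changes above.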
@@ -391,13 +425,13 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, } while (pmd++, addr = next, addr != end); } -static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, +static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -409,6 +443,20 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); } +static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_flush_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -421,7 +469,7 @@ static void stage2_flush_memslot(struct kvm *kvm, do { next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_puds(kvm, pgd, addr, next); + stage2_flush_p4ds(kvm, pgd, addr, next); if (next != end) cond_resched_lock(&kvm->mmu_lock); @@ -454,12 +502,21 @@ static void stage2_flush_vm(struct kvm *kvm) static void clear_hyp_pgd_entry(pgd_t *pgd) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); pgd_clear(pgd); - pud_free(NULL, pud_table); + p4d_free(NULL, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_hyp_p4d_entry(p4d_t *p4d) +{ + pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); + VM_BUG_ON(p4d_huge(*p4d)); + p4d_clear(p4d); + pud_free(NULL, pud_table); + put_page(virt_to_page(p4d)); +} + static void clear_hyp_pud_entry(pud_t *pud) { pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); @@ -511,12 +568,12 @@ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) clear_hyp_pud_entry(pud); } -static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); /* Hyp doesn't use huge puds */ @@ -525,6 +582,23 @@ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) } while (pud++, addr = next, addr != end); if (hyp_pud_table_empty(start_pud)) + clear_hyp_p4d_entry(p4d); +} + +static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + /* Hyp doesn't use huge p4ds */ + if (!p4d_none(*p4d)) + unmap_hyp_puds(p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (hyp_p4d_table_empty(start_p4d)) clear_hyp_pgd_entry(pgd); } @@ -548,7 +622,7 @@ static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, do { next = pgd_addr_end(addr, end); if (!pgd_none(*pgd)) - unmap_hyp_puds(pgd, addr, next); + unmap_hyp_p4ds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -658,7 +732,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, return 0; } -static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, +static int 
create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -669,7 +743,7 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, addr = start; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) { pmd = pmd_alloc_one(NULL, addr); @@ -691,12 +765,45 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, return 0; } +static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + p4d_t *p4d; + pud_t *pud; + unsigned long addr, next; + int ret; + + addr = start; + do { + p4d = p4d_offset(pgd, addr); + + if (p4d_none(*p4d)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); + return -ENOMEM; + } + kvm_p4d_populate(p4d, pud); + get_page(virt_to_page(p4d)); + } + + next = p4d_addr_end(addr, end); + ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; unsigned long addr, next; int err = 0; @@ -707,18 +814,18 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); if (pgd_none(*pgd)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); + p4d = p4d_alloc_one(NULL, addr); + if (!p4d) { + kvm_err("Cannot allocate Hyp p4d\n"); err = -ENOMEM; goto out; } - kvm_pgd_populate(pgd, pud); + kvm_pgd_populate(pgd, p4d); get_page(virt_to_page(pgd)); } next = pgd_addr_end(addr, end); - err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); + err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); if (err) goto out; pfn += (next - addr) >> PAGE_SHIFT; @@ -1015,22 +1122,40 @@ void kvm_free_stage2_pgd(struct kvm *kvm) free_pages_exact(pgd, stage2_pgd_size(kvm)); } -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; - pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, pud); + p4d = mmu_memory_cache_alloc(cache); + stage2_pgd_populate(kvm, pgd, p4d); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(kvm, pgd, addr); + return stage2_p4d_offset(kvm, pgd, addr); +} + +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + p4d_t *p4d; + pud_t *pud; + + p4d = stage2_get_p4d(kvm, cache, addr); + if (stage2_p4d_none(kvm, *p4d)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + stage2_p4d_populate(kvm, p4d, pud); + get_page(virt_to_page(p4d)); + } + + return stage2_pud_offset(kvm, p4d, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1423,18 +1548,18 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, } /** - * stage2_wp_puds - write protect PGD range + * stage2_wp_puds - write protect P4D range * @pgd: pointer to pgd entry * @addr: range start address * @end: range end address */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, 
+static void stage2_wp_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -1449,6 +1574,26 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, } /** + * stage2_wp_p4ds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_wp_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +/** * stage2_wp_range() - write protect stage2 memory region range * @kvm: The KVM pointer * @addr: Start address of range @@ -1475,7 +1620,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) break; next = stage2_pgd_addr_end(kvm, addr, end); if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); + stage2_wp_p4ds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index dff2d72b0883..df8ae73d950b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -145,6 +145,7 @@ static void show_pte(unsigned long addr) pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd)); do { + p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -152,7 +153,13 @@ static void show_pte(unsigned long addr) if (pgd_none(pgd) || pgd_bad(pgd)) break; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); + pr_cont(", p4d=%016llx", p4d_val(p4d)); + if (p4d_none(p4d) || p4d_bad(p4d)) + break; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); pr_cont(", pud=%016llx", pud_val(pud)); if (pud_none(pud) || pud_bad(pud)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 07f154b8b84a..0a52ce46f020 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -67,11 +67,13 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { pgd_t *pgdp = pgd_offset(mm, addr); + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; *pgsize = PAGE_SIZE; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_offset(p4dp, addr); pmdp = pmd_offset(pudp, addr); if ((pte_t *)pmdp == ptep) { *pgsize = PMD_SIZE; @@ -217,12 +219,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; @@ -261,6 +265,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -268,7 +273,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (!pgd_present(READ_ONCE(*pgdp))) return NULL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (!p4d_present(READ_ONCE(*p4dp))) + return NULL; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (sz != PUD_SIZE && pud_none(pud)) return NULL; diff --git a/arch/arm64/mm/kasan_init.c 
b/arch/arm64/mm/kasan_init.c index f87a32484ea8..2339811f317b 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -84,17 +84,17 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } -static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node, +static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { - if (pgd_none(READ_ONCE(*pgdp))) { + if (p4d_none(READ_ONCE(*p4dp))) { phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); - __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE); + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); } - return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr); + return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); } static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, @@ -126,11 +126,11 @@ static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, } while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp))); } -static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, +static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early); + pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early); do { next = pud_addr_end(addr, end); @@ -138,6 +138,18 @@ static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, } while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp))); } +static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr, + unsigned long end, int node, bool early) +{ + unsigned long next; + p4d_t *p4dp = p4d_offset(pgdp, addr); + + do { + next = p4d_addr_end(addr, end); + kasan_pud_populate(p4dp, addr, next, node, early); + } while (p4dp++, addr = next, addr != end); +} + static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, int node, bool early) { @@ -147,7 +159,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, pgdp = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_pud_populate(pgdp, addr, next, node, early); + kasan_p4d_populate(pgdp, addr, next, node, early); } while (pgdp++, addr = next, addr != end); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c299b73dd5e4..e7fbc6275329 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -290,18 +290,19 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, { unsigned long next; pud_t *pudp; - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - if (pgd_none(pgd)) { + if (p4d_none(p4d)) { phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); - __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); - pgd = READ_ONCE(*pgdp); + __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE); + p4d = READ_ONCE(*p4dp); } - BUG_ON(pgd_bad(pgd)); + BUG_ON(p4d_bad(p4d)); - pudp = pud_set_fixmap_offset(pgdp, addr); + pudp = pud_set_fixmap_offset(p4dp, addr); do { pud_t old_pud = READ_ONCE(*pudp); @@ -672,6 +673,7 @@ static void __init map_kernel(pgd_t *pgdp) READ_ONCE(*pgd_offset_k(FIXADDR_START))); } else if (CONFIG_PGTABLE_LEVELS > 3) { pgd_t *bm_pgdp; + p4d_t *bm_p4dp; pud_t *bm_pudp; /* * The fixmap shares its top level pgd entry with the kernel @@ -681,7 +683,8 @@ static void 
__init map_kernel(pgd_t *pgdp) */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START); - bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START); + bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START); + bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START); pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd)); pud_clear_fixmap(); } else { @@ -715,6 +718,7 @@ void __init paging_init(void) int kern_addr_valid(unsigned long addr) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -726,7 +730,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(READ_ONCE(*pgdp))) return 0; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return 0; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return 0; @@ -1069,6 +1077,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, unsigned long addr = start; unsigned long next; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; @@ -1079,7 +1088,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, if (!pgdp) return -ENOMEM; - pudp = vmemmap_pud_populate(pgdp, addr, node); + p4dp = vmemmap_p4d_populate(pgdp, addr, node); + if (!p4dp) + return -ENOMEM; + + pudp = vmemmap_pud_populate(p4dp, addr, node); if (!pudp) return -ENOMEM; @@ -1114,11 +1127,12 @@ void vmemmap_free(unsigned long start, unsigned long end, static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgdp = pgd_offset_k(addr); - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - BUG_ON(pgd_none(pgd) || pgd_bad(pgd)); + BUG_ON(p4d_none(p4d) || p4d_bad(p4d)); - return pud_offset_kimg(pgdp, addr); + return pud_offset_kimg(p4dp, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -1144,25 +1158,27 @@ static inline pte_t * fixmap_pte(unsigned long addr) */ void __init early_fixmap_init(void) { - pgd_t *pgdp, pgd; + pgd_t *pgdp; + p4d_t *p4dp, p4d; pud_t *pudp; pmd_t *pmdp; unsigned long addr = FIXADDR_START; pgdp = pgd_offset_k(addr); - pgd = READ_ONCE(*pgdp); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); if (CONFIG_PGTABLE_LEVELS > 3 && - !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) { + !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. 
*/ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - pudp = pud_offset_kimg(pgdp, addr); + pudp = pud_offset_kimg(p4dp, addr); } else { - if (pgd_none(pgd)) - __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); + if (p4d_none(p4d)) + __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); pudp = fixmap_pud(addr); } if (pud_none(READ_ONCE(*pudp))) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index bde08090b838..4175bcb8ccb3 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -198,6 +198,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) bool kernel_page_present(struct page *page) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep; @@ -210,7 +211,11 @@ bool kernel_page_present(struct page *page) if (pgd_none(READ_ONCE(*pgdp))) return false; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return false; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index a345a2f2c22e..14645e3d5cd5 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,22 +30,14 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do {} while (0) extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* __ASM_CSKY_HIGHMEM_H */ diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 813129145f3d..3b3f622f5ae9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -13,59 +13,39 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap); -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 
int idx; if (vaddr < FIXADDR_START) - goto out; + return; #ifdef CONFIG_DEBUG_HIGHMEM idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); @@ -78,11 +58,8 @@ void __kunmap_atomic(void *kvaddr) (void) idx; /* to kill a warning */ #endif kmap_atomic_idx_pop(); -out: - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -104,19 +81,6 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - static void __init kmap_pages_init(void) { unsigned long vaddr; diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index 4d00152fab58..f00828720dc4 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _H8300_PGTABLE_H #define _H8300_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> #include <asm-generic/pgtable.h> extern void paging_init(void); diff --git a/arch/hexagon/include/asm/fixmap.h b/arch/hexagon/include/asm/fixmap.h index 933dac167504..97b1b062e750 100644 --- a/arch/hexagon/include/asm/fixmap.h +++ b/arch/hexagon/include/asm/fixmap.h @@ -16,7 +16,7 @@ #include <asm-generic/fixmap.h> #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), \ - (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), \ + (vaddr)), (vaddr)), (vaddr)), (vaddr)) #endif diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index d383e8bea5b2..2a17d4eb2fa4 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -12,7 +12,6 @@ * Page table definitions for Qualcomm Hexagon processor. */ #include <asm/page.h> -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> /* A handy thing to have if one has the RAM. 
Declared in head.S */ diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index f4c491044882..2a3050345099 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -36,9 +36,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #if CONFIG_PGTABLE_LEVELS == 4 static inline void -pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) +p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud) { - pgd_val(*pgd_entry) = __pa(pud); + p4d_val(*p4d_entry) = __pa(pud); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 0e7b645b76c6..787b0a91d255 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -283,12 +283,12 @@ extern unsigned long VMALLOC_END; #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) #if CONFIG_PGTABLE_LEVELS == 4 -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) -#define pgd_present(pgd) (pgd_val(pgd) != 0UL) -#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) -#define pgd_page(pgd) virt_to_page((pgd_val(pgd) + PAGE_OFFSET)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!ia64_phys_addr_valid(p4d_val(p4d))) +#define p4d_present(p4d) (p4d_val(p4d) != 0UL) +#define p4d_clear(p4dp) (p4d_val(*(p4dp)) = 0UL) +#define p4d_page_vaddr(p4d) ((unsigned long) __va(p4d_val(p4d) & _PFN_MASK)) +#define p4d_page(p4d) virt_to_page((p4d_val(p4d) + PAGE_OFFSET)) #endif /* @@ -386,7 +386,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) #if CONFIG_PGTABLE_LEVELS == 4 /* Find an entry in the second-level page table.. */ #define pud_offset(dir,addr) \ - ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + ((pud_t *) p4d_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #endif /* Find an entry in the third-level page table.. 
*/ @@ -580,10 +580,9 @@ extern struct page *zero_page_memmap_ptr; #if CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> #endif -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> #include <asm-generic/pgtable.h> #endif /* _ASM_IA64_PGTABLE_H */ diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 30d0c1fca99e..12242aa0dad1 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -29,6 +29,7 @@ static int mapped_kernel_page_is_present (unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; @@ -37,7 +38,11 @@ mapped_kernel_page_is_present (unsigned long address) if (pgd_none(*pgd) || pgd_bad(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_bad(*p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_bad(*pud)) return 0; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d16e419fd712..32352a73df0c 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -30,12 +30,14 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); - pud = pud_alloc(mm, pgd, taddr); + p4d = p4d_offset(pgd, taddr); + pud = pud_alloc(mm, p4d, taddr); if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) @@ -49,17 +51,21 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); if (pgd_present(*pgd)) { - pud = pud_offset(pgd, taddr); - if (pud_present(*pud)) { - pmd = pmd_offset(pud, taddr); - if (pmd_present(*pmd)) - pte = pte_offset_map(pmd, taddr); + p4d = p4d_offset(pgd, addr); + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, taddr); + if (pud_present(*pud)) { + pmd = pmd_offset(pud, taddr); + if (pmd_present(*pmd)) + pte = pte_offset_map(pmd, taddr); + } } } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d637b4ea3147..ca760f6cb18f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -208,6 +208,7 @@ static struct page * __init put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -215,7 +216,10 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! 
*/ { - pud = pud_alloc(&init_mm, pgd, address); + p4d = p4d_alloc(&init_mm, pgd, address); + if (!p4d) + goto out; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) goto out; pmd = pmd_alloc(&init_mm, pud, address); @@ -382,6 +386,7 @@ int vmemmap_find_next_valid_pfn(int node, int i) do { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +397,13 @@ int vmemmap_find_next_valid_pfn(int node, int i) continue; } - pud = pud_offset(pgd, end_address); + p4d = p4d_offset(pgd, end_address); + if (p4d_none(*p4d)) { + end_address += P4D_SIZE; + continue; + } + + pud = pud_offset(p4d, end_address); if (pud_none(*pud)) { end_address += PUD_SIZE; continue; @@ -430,6 +441,7 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) struct page *map_start, *map_end; int node; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -444,12 +456,20 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); if (pgd_none(*pgd)) { + p4d = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); + if (!p4d) + goto err_alloc; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d)) { pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); if (!pud) goto err_alloc; - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); } - pud = pud_offset(pgd, address); + pud = pud_offset(p4d, address); if (pud_none(*pud)) { pmd = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 332c78e15198..284ca8fb54c1 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -26,7 +26,6 @@ #include <asm/fixmap.h> extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* @@ -51,32 +50,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); -extern void __kunmap_atomic(void *kvaddr); - -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index d7569f77fa15..92e0890416c9 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -32,18 +32,12 @@ */ #include <asm/tlbflush.h> -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -55,19 +49,16 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); 
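Stepping back from the kmap hunks briefly: the p4d_offset() steps threaded through the page-table walkers in this series (arm64 and ia64 above, powerpc, sh and others below) compile away on architectures without a real fifth table level, because <asm-generic/pgtable-nop4d.h> folds p4d into pgd. Roughly, and abridged from that header:

typedef struct { pgd_t pgd; } p4d_t;	/* the one p4d entry aliases the pgd entry */

#define PTRS_PER_P4D	1

static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
	return (p4d_t *)pgd;	/* no extra dereference on folded configs */
}

/* With the level folded, the pgd-level checks become compile-time constants */
static inline int pgd_none(pgd_t pgd)		{ return 0; }
static inline int pgd_present(pgd_t pgd)	{ return 1; }

So a walk that tests p4d_none() immediately after pgd_none(), as kern_addr_valid() and huge_pte_offset() now do, costs nothing where the extra level does not exist.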
+EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; unsigned int idx; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } type = kmap_atomic_idx(); @@ -83,7 +74,5 @@ void __kunmap_atomic(void *kvaddr) local_flush_tlb_page(NULL, vaddr); kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index dcaa53d11339..d943f69784b1 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,8 +49,6 @@ unsigned long lowmem_size; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); static inline pte_t *virt_to_kpte(unsigned long vaddr) { @@ -68,7 +66,6 @@ static void __init highmem_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; } static void highmem_setup(void) diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9d84aafc33d0..f1f788b57166 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,21 +46,14 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index ad6df1cea866..3e81ba000096 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -14,9 +14,9 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> -#include <asm/highmem.h> #include <asm/processor.h> #include <asm/cpu.h> #include <asm/cpu-features.h> @@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); } EXPORT_SYMBOL(__flush_dcache_page); @@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); ClearPageDcacheDirty(page); } diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index d08e6d7d533b..8e8726992720 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -12,71 +12,37 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page 
*page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type __maybe_unused; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -94,10 +60,8 @@ void __kunmap_atomic(void *kvaddr) } #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b3a82c97ded3..5717647d14d1 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -32,7 +32,6 @@ #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL static inline void flush_cache_kmaps(void) { @@ -44,9 +43,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - extern void kmap_init(void); /* @@ -54,12 +50,7 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); -extern void kunmap(struct page *page); -extern void *kmap_atomic(struct page *page); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #endif #endif diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 022779af6148..4284cd59e21a 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,45 +10,18 @@ #include <asm/fixmap.h> #include <asm/tlbflush.h> -void *kmap(struct page *page) -{ - unsigned long vaddr; - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - vaddr = (unsigned long)kmap_high(page); - return (void *)vaddr; -} - -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -EXPORT_SYMBOL(kunmap); - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr, pte; int type; pte_t *ptep; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (PAGE_KERNEL); + pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); @@ -58,10 +31,9 @@ void *kmap_atomic(struct page *page) __nds32__isb(); return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_high_prot); -EXPORT_SYMBOL(kmap_atomic); - -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START) { unsigned long vaddr = (unsigned long)kvaddr; @@ -72,8 +44,5 @@ void __kunmap_atomic(void *kvaddr) ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, 0); } - pagefault_enable(); - preempt_enable(); } - -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index f98b7f4519ba..47a1a3ea5734 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -22,7 +22,6 @@ #include <asm/tlbflush.h> #include <asm/pgtable-bits.h> -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #define FIRST_USER_ADDRESS 0UL @@ -100,7 +99,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; */ static inline void set_pmd(pmd_t *pmdptr, pmd_t pmdval) { - pmdptr->pud.pgd.pgd = pmdval.pud.pgd.pgd; + *pmdptr = pmdval; } /* to find an entry in a page-table-directory */ diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index ec9d8a9c426f..964eac1a21d0 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -242,6 +242,7 @@ vmalloc_fault: */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -253,8 +254,12 @@ vmalloc_fault: goto no_context; set_pgd(pgd, *pgd_k); - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); diff --git a/arch/nios2/mm/ioremap.c b/arch/nios2/mm/ioremap.c index 819bdfcc2e71..fe821efb9a99 100644 --- a/arch/nios2/mm/ioremap.c +++ 
b/arch/nios2/mm/ioremap.c @@ -86,11 +86,15 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, if (address >= end) BUG(); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; error = -ENOMEM; - pud = pud_alloc(&init_mm, dir, address); + p4d = p4d_alloc(&init_mm, dir, address); + if (!p4d) + break; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) break; pmd = pmd_alloc(&init_mm, pud, address); diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 7f3fb9ceb083..219979e57790 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -21,7 +21,6 @@ #ifndef __ASM_OPENRISC_PGTABLE_H #define __ASM_OPENRISC_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #ifndef __ASSEMBLY__ diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 8af1cc78c4fb..6e0a11ac4c00 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -295,6 +295,7 @@ vmalloc_fault: int offset = pgd_index(address); pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -321,8 +322,13 @@ vmalloc_fault: * it exists. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index f94fe6d3f499..3bcdc1c26b23 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -68,6 +68,7 @@ static void __init map_ram(void) unsigned long v, p, e; pgprot_t prot; pgd_t *pge; + p4d_t *p4e; pud_t *pue; pmd_t *pme; pte_t *pte; @@ -87,7 +88,8 @@ static void __init map_ram(void) while (p < e) { int j; - pue = pud_offset(pge, v); + p4e = p4d_offset(pge, v); + pue = pud_offset(p4e, v); pme = pmd_offset(pue, v); if ((u32) pue != (u32) pge || (u32) pme != (u32) pge) { diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0c83644bfa5c..99663fc1f997 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -100,37 +100,11 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#include <asm/kmap_types.h> - -#define ARCH_HAS_KMAP - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -static inline void kunmap(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} - -static inline void *kmap_atomic(struct page *page) -{ - preempt_disable(); - pagefault_disable(); - return page_address(page); -} - -static inline void __kunmap_atomic(void *addr) +#define ARCH_HAS_FLUSH_ON_KUNMAP +static inline void kunmap_flush_on_unmap(void *addr) { flush_kernel_dcache_page_addr(addr); - pagefault_enable(); - preempt_enable(); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a8eee7a64add..c4f36a0b6b6e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,7 @@ config PPC # select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git 
a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7549393c4c43..6052b72216a6 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #include <asm/book3s/32/hash.h> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 6fc4520092c7..73ad038ed10b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea) #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) -static inline int hash__pgd_bad(pgd_t pgd) +static inline int hash__p4d_bad(p4d_t p4d) { - return (pgd_val(pgd) == 0); + return (p4d_val(p4d) == 0); } #ifdef CONFIG_STRICT_KERNEL_RWX extern void hash__mark_rodata_ro(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index a41e91bd0580..69c5b051734f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud) { - *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index d6438659926c..87168eb9490c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> #ifndef __ASSEMBLY__ #include <linux/mmdebug.h> @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift; /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0xc0000000000000ffUL /* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL +#define P4D_MASKED_BITS 0xc0000000000000ffUL /* * Used as an indicator for rcu callback functions @@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return pte_access_permitted(pud_pte(pud), write); } -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) }) +static inline __be64 p4d_raw(p4d_t x) +{ + return pgd_raw(x.pgd); +} + +#define p4d_write(p4d) pte_write(p4d_pte(p4d)) -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline int pgd_none(pgd_t pgd) +static inline int p4d_none(p4d_t p4d) { - return !pgd_raw(pgd); + return !p4d_raw(p4d); } -static inline int pgd_present(pgd_t pgd) +static inline int p4d_present(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT)); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - 
return __pte_raw(pgd_raw(pgd)); + return __pte_raw(p4d_raw(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd_raw(pte_raw(pte)); + return __p4d_raw(pte_raw(pte)); } -static inline int pgd_bad(pgd_t pgd) +static inline int p4d_bad(p4d_t p4d) { if (radix_enabled()) - return radix__pgd_bad(pgd); - return hash__pgd_bad(pgd); + return radix__p4d_bad(p4d); + return hash__p4d_bad(p4d); } -#define pgd_access_permitted pgd_access_permitted -static inline bool pgd_access_permitted(pgd_t pgd, bool write) +#define p4d_access_permitted p4d_access_permitted +static inline bool p4d_access_permitted(p4d_t p4d, bool write) { - return pte_access_permitted(pgd_pte(pgd), write); + return pte_access_permitted(p4d_pte(p4d), write); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); /* Pointers in the page table tree are physical addresses */ #define __pgtable_ptr_val(ptr) __pa(ptr) #define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) #define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) -#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_page_vaddr(p4d) __va(p4d_val(p4d) & ~P4D_MASKED_BITS) #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) @@ -1010,8 +1016,8 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr)) +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr)) #define pmd_offset(pudp,addr) \ (((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr)) #define pte_offset_kernel(dir,addr) \ @@ -1366,11 +1372,11 @@ static inline bool pud_is_leaf(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } -#define pgd_is_leaf pgd_is_leaf -#define pgd_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#define p4d_is_leaf p4d_is_leaf +#define p4d_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE)); } #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 08c222d5b764..0cba794c4fb8 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -30,7 +30,7 @@ /* Don't have anything in the reserved bits and leaf bits */ #define RADIX_PMD_BAD_BITS 0x60000000000000e0UL #define RADIX_PUD_BAD_BITS 0x60000000000000e0UL -#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL +#define RADIX_P4D_BAD_BITS 0x60000000000000e0UL #define RADIX_PMD_SHIFT (PAGE_SHIFT + RADIX_PTE_INDEX_SIZE) #define RADIX_PUD_SHIFT (RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE) @@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud) } -static inline int radix__pgd_bad(pgd_t pgd) +static inline int radix__p4d_bad(p4d_t p4d) { - return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); + return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index a4b65b186ec6..104026f7d6bc 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -30,7 +30,6 @@ #include <asm/fixmap.h> extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; /* @@ 
-59,33 +58,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); -extern void __kunmap_atomic(void *kvaddr); - -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b04ba257fddb..3d0bc99dd520 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H #define _ASM_POWERPC_NOHASH_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index b9534a793293..668aee6017e7 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -15,7 +15,7 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) +#define p4d_populate(MM, P4D, PUD) p4d_set(P4D, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index c40ec32b8194..81b1c54e3cf1 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> /* * Entries per page directory level. 
The PTE level must use a 64b record @@ -45,41 +45,41 @@ #define PMD_MASKED_BITS 0 /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 +/* Bits to mask out from a P4D to get to the PUD page */ +#define P4D_MASKED_BITS 0 /* * 4-level page tables related bits */ -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (pgd_val(pgd) == 0) -#define pgd_present(pgd) (pgd_val(pgd) != 0) -#define pgd_page_vaddr(pgd) (pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (p4d_val(p4d) == 0) +#define p4d_present(p4d) (p4d_val(p4d) != 0) +#define p4d_page_vaddr(p4d) (p4d_val(p4d) & ~P4D_MASKED_BITS) #ifndef __ASSEMBLY__ -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - return __pte(pgd_val(pgd)); + return __pte(p4d_val(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd(pte_val(pte)); + return __p4d(pte_val(pte)); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); #endif /* !__ASSEMBLY__ */ -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + \ +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + \ (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #define pud_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 9a33b8bd842d..b360f262b9c6 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -175,11 +175,11 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define p4d_write(p4d) pte_write(p4d_pte(p4d)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) +static inline void p4d_set(p4d_t *p4dp, unsigned long val) { - *pgdp = __pgd(val); + *p4dp = __p4d(val); } /* diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index b1f1d5339735..bad9b324559d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,12 +44,12 @@ struct mm_struct; #ifdef CONFIG_PPC32 static inline pmd_t *pmd_ptr(struct mm_struct *mm, unsigned long va) { - return pmd_offset(pud_offset(pgd_offset(mm, va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_ptr_k(unsigned long va) { - return pmd_offset(pud_offset(pgd_offset_k(va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) @@ -158,9 +158,9 @@ static inline bool pud_is_leaf(pud_t pud) } #endif -#ifndef pgd_is_leaf -#define pgd_is_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#ifndef p4d_is_leaf +#define p4d_is_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index aa12cd4078b3..d605ed0bb2e7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -499,13 +499,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) unsigned long ig; for (ig = 0; ig <
PTRS_PER_PGD; ++ig, ++pgd) { + p4d_t *p4d = p4d_offset(pgd, 0); pud_t *pud; - if (!pgd_present(*pgd)) + if (!p4d_present(*p4d)) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(p4d, 0); kvmppc_unmap_free_pud(kvm, pud, lpid); - pgd_clear(pgd); + p4d_clear(p4d); } } @@ -566,6 +567,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud, *new_pud = NULL; pmd_t *pmd, *new_pmd = NULL; pte_t *ptep, *new_ptep = NULL; @@ -573,9 +575,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Traverse the guest's 2nd-level tree, allocate new levels needed */ pgd = pgtable + pgd_index(gpa); + p4d = p4d_offset(pgd, gpa); + pud = NULL; - if (pgd_present(*pgd)) - pud = pud_offset(pgd, gpa); + if (p4d_present(*p4d)) + pud = pud_offset(p4d, gpa); else new_pud = pud_alloc_one(kvm->mm, gpa); @@ -596,13 +600,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Now traverse again under the lock and change the tree */ ret = -ENOMEM; - if (pgd_none(*pgd)) { + if (p4d_none(*p4d)) { if (!new_pud) goto out_unlock; - pgd_populate(kvm->mm, pgd, new_pud); + p4d_populate(kvm->mm, p4d, new_pud); new_pud = NULL; } - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d, gpa); if (pud_is_leaf(*pud)) { unsigned long hgpa = gpa & PUD_MASK; @@ -1220,7 +1224,8 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, unsigned long gpa; pgd_t *pgt; struct kvm_nested_guest *nested; - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ptep; @@ -1293,13 +1298,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, } pgdp = pgt + pgd_index(gpa); - pgd = READ_ONCE(*pgdp); - if (!(pgd_val(pgd) & _PAGE_PRESENT)) { - gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + p4dp = p4d_offset(pgdp, gpa); + p4d = READ_ONCE(*p4dp); + if (!(p4d_val(p4d) & _PAGE_PRESENT)) { + gpa = (gpa & P4D_MASK) + P4D_SIZE; continue; } - pudp = pud_offset(&pgd, gpa); + pudp = pud_offset(&p4d, gpa); pud = READ_ONCE(*pudp); if (!(pud_val(pud) & _PAGE_PRESENT)) { gpa = (gpa & PUD_MASK) + PUD_SIZE; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 3345f039a876..7a59f6863cec 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr) pte_t *ptep; pmd_t *pmdp; pud_t *pudp; + p4d_t *p4dp; pgd_t *pgdp; pgdp = pgd_offset_k(addr); if (unlikely(!pgdp)) return -EINVAL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (unlikely(!p4dp)) + return -EINVAL; + + pudp = pud_offset(p4dp, addr); if (unlikely(!pudp)) return -EINVAL; diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..9cd15937e88a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp 
= pmd_alloc(&init_mm, pudp, ea); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 8f9edf07063a..97891ca0d428 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -65,17 +65,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); + p4d_populate(&init_mm, p4dp, pudp); } - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (map_page_size == PUD_SIZE) { ptep = (pte_t *)pudp; goto set_the_pte; @@ -115,6 +117,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -137,7 +140,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, * boot. */ pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) { @@ -174,6 +178,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end, { unsigned long idx; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -186,7 +191,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end, for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; if (pud_is_leaf(*pudp)) { @@ -850,6 +856,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) unsigned long addr, next; pud_t *pud_base; pgd_t *pgd; + p4d_t *p4d; spin_lock(&init_mm.page_table_lock); @@ -857,15 +864,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) continue; - if (pgd_is_leaf(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + if (p4d_is_leaf(*p4d)) { + split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d); continue; } - pud_base = (pud_t *)pgd_page_vaddr(*pgd); + pud_base = (pud_t *)p4d_page_vaddr(*p4d); remove_pud_table(pud_base, addr, next); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 2ef24a53f4c9..25a0c044bd93 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 320c1672b2ae..624b4438aff9 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -24,22 +24,11 @@ #include <linux/highmem.h> #include <linux/module.h> -/* - * The use of kmap_atomic/kunmap_atomic is 
discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -49,17 +38,14 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { int type = kmap_atomic_idx(); @@ -77,7 +63,5 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 4d5ed1093615..f122d0f2c295 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pg; + p4d_t *p4; pud_t *pu; pmd_t *pm; hugepd_t *hpdp = NULL; @@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz addr &= ~(sz-1); pg = pgd_offset(mm, addr); + p4 = p4d_offset(pg, addr); #ifdef CONFIG_PPC_BOOK3S_64 if (pshift == PGDIR_SHIFT) /* 16GB huge page */ - return (pte_t *) pg; + return (pte_t *) p4; else if (pshift > PUD_SHIFT) { /* * We need to use hugepd table */ ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift == PUD_SHIFT) @@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz #else if (pshift >= PGDIR_SHIFT) { ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift >= PUD_SHIFT) { @@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, start = addr; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); next = pud_addr_end(addr, end); if (!is_hugepd(__hugepd(pud_val(*pud)))) { if (pud_none_or_clear_bad(pud)) @@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - pud = pud_offset(pgd, start); - pgd_clear(pgd); + pud = pud_offset(p4d, start); + p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } @@ 
-449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; + p4d_t *p4d; unsigned long next; /* @@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); pgd = pgd_offset(tlb->mm, addr); + p4d = p4d_offset(pgd, addr); if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { - if (pgd_none_or_clear_bad(pgd)) + if (p4d_none_or_clear_bad(p4d)) continue; - hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); } else { unsigned long more; /* @@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, if (more > next) next = more; - free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, addr, next, floor, ceiling); } } while (addr = next, addr != end); diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index cbcad369fcb2..c99aa8cbaac5 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -121,7 +121,7 @@ static void __init kasan_unmap_early_shadow_vmalloc(void) phys_addr_t pa = __pa(kasan_early_shadow_page); for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pmd_t *pmd = pmd_ptr_k(k_cur); pte_t *ptep = pte_offset_kernel(pmd, k_cur); if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0fcea21f26b4..7cebb9c818d3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -64,8 +64,6 @@ bool init_mem_is_free; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -245,7 +243,6 @@ void __init paging_init(void) pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 4637fdd469cf..77884e24281d 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size) int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); @@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) return -ENOMEM; } else { pgdp = pgd_offset_k(ea); -#ifndef __PAGETABLE_PUD_FOLDED - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - pgd_populate(&init_mm, pgdp, pudp); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pmdp); } -#endif /* !__PAGETABLE_PUD_FOLDED */ - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (pud_none(*pudp)) { pmdp = 
early_alloc_pgtable(PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index e3759b69f81b..c2499271f6c1 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, void assert_pte_locked(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) return; pgd = mm->pgd + pgd_index(addr); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); /* @@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys); pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *hpage_shift) { - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ret_pte; hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; + unsigned pdshift; if (hpage_shift) *hpage_shift = 0; @@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (is_thp) *is_thp = false; - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); /* * Always operate on the local stack value. This make sure the * value don't get updated by a parallel THP split/collapse, * page fault or a page unmap. The return pte_t * is still not * stable. So should be checked there for above conditions. + * Top level is an exception because it is folded into p4d. */ - if (pgd_none(pgd)) + pgdp = pgdir + pgd_index(ea); + p4dp = p4d_offset(pgdp, ea); + p4d = READ_ONCE(*p4dp); + pdshift = P4D_SHIFT; + + if (p4d_none(p4d)) return NULL; - if (pgd_is_leaf(pgd)) { - ret_pte = (pte_t *)pgdp; + if (p4d_is_leaf(p4d)) { + ret_pte = (pte_t *)p4dp; goto out; } - if (is_hugepd(__hugepd(pgd_val(pgd)))) { - hpdp = (hugepd_t *)&pgd; + if (is_hugepd(__hugepd(p4d_val(p4d)))) { + hpdp = (hugepd_t *)&p4d; goto out_huge; } @@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, * irq disabled */ pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); + pudp = pud_offset(&p4d, ea); pud = READ_ONCE(*pudp); if (pud_none(pud)) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e78832dce7bb..1f86a88fd4bb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift); #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ -struct page *pgd_page(pgd_t pgd) +struct page *p4d_page(p4d_t p4d) { - if (pgd_is_leaf(pgd)) { - VM_WARN_ON(!pgd_huge(pgd)); - return pte_page(pgd_pte(pgd)); + if (p4d_is_leaf(p4d)) { + VM_WARN_ON(!p4d_huge(p4d)); + return pte_page(p4d_pte(p4d)); } - return virt_to_page(pgd_page_vaddr(pgd)); + return virt_to_page(p4d_page_vaddr(p4d)); } #endif diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b6ed9578382f..6aaeb1eb3b9c 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -431,6 +431,20 @@ static void walk_pud(struct 
pg_state *st, pgd_t *pgd, unsigned long start) } } +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) + /* p4d exists */ + walk_pud(st, p4d, addr); + } +} + static void walk_pagetables(struct pg_state *st) { pgd_t *pgd = pgd_offset_k(0UL); @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st) addr = KERN_VIRT_START + i * PGDIR_SIZE; if (!pgd_none(*pgd)) /* pgd exists */ - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } } diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index d92bb8ea229c..b3fead0230c1 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st) * the hash pagetable. */ for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd)) + p4d_t *p4d = p4d_offset(pgd, 0); + + if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d)) /* pgd exists */ - walk_pud(st, pgd, addr); + walk_pud(st, p4d, addr); else - note_page(st, addr, 1, pgd_val(*pgd)); + note_page(st, addr, 1, p4d_val(*p4d)); } } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index b2cde1732301..5ace2f9a277e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -337,39 +337,19 @@ static int pseries_remove_mem_node(struct device_node *np) static bool lmb_is_removable(struct drmem_lmb *lmb) { - int i, scns_per_block; - bool rc = true; - unsigned long pfn, block_sz; - u64 phys_addr; - if (!(lmb->flags & DRCONF_MEM_ASSIGNED)) return false; - block_sz = memory_block_size_bytes(); - scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - phys_addr = lmb->base_addr; - #ifdef CONFIG_FA_DUMP /* * Don't hot-remove memory that falls in fadump boot memory area * and memory that is reserved for capturing old kernel memory. 
*/ - if (is_fadump_memory_area(phys_addr, block_sz)) + if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes())) return false; #endif - - for (i = 0; i < scns_per_block; i++) { - pfn = PFN_DOWN(phys_addr); - if (!pfn_in_present_section(pfn)) { - phys_addr += MIN_MEMORY_BLOCK_SIZE; - continue; - } - - rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); - phys_addr += MIN_MEMORY_BLOCK_SIZE; - } - - return rc; + /* device_offline() will determine if we can actually remove this lmb */ + return true; } static int dlpar_add_lmb(struct drmem_lmb *); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 7af840c0fc93..89415b84c597 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3135,7 +3135,8 @@ static void show_pte(unsigned long addr) unsigned long tskv = 0; struct task_struct *tsk = NULL; struct mm_struct *mm; - pgd_t *pgdp, *pgdir; + pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -3159,28 +3160,26 @@ static void show_pte(unsigned long addr) catch_memory_errors = 1; sync(); - if (mm == &init_mm) { + if (mm == &init_mm) pgdp = pgd_offset_k(addr); - pgdir = pgd_offset_k(0); - } else { + else pgdp = pgd_offset(mm, addr); - pgdir = pgd_offset(mm, 0); - } - if (pgd_none(*pgdp)) { - printf("no linux page table for address\n"); + p4dp = p4d_offset(pgdp, addr); + + if (p4d_none(*p4dp)) { + printf("No valid P4D\n"); return; } - printf("pgd @ 0x%px\n", pgdir); - - if (pgd_is_leaf(*pgdp)) { - format_pte(pgdp, pgd_val(*pgdp)); + if (p4d_is_leaf(*p4dp)) { + format_pte(p4dp, p4d_val(*p4dp)); return; } - printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp)); - pudp = pud_offset(pgdp, addr); + printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp)); + + pudp = pud_offset(p4dp, addr); if (pud_none(*pudp)) { printf("No valid PUD\n"); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d6dc6933adc2..f854faff38c3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -59,6 +59,7 @@ config KASAN_SHADOW_OFFSET config S390 def_bool y select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/sh/include/asm/pgtable-2level.h b/arch/sh/include/asm/pgtable-2level.h index bf1eb51c3ee5..08bff93927ff 100644 --- a/arch/sh/include/asm/pgtable-2level.h +++ b/arch/sh/include/asm/pgtable-2level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_2LEVEL_H #define __ASM_SH_PGTABLE_2LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> /* diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 779260b721ca..0f80097e5c9c 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_3LEVEL_H #define __ASM_SH_PGTABLE_3LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> /* diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 29274f0e428e..4acce5f2cbf9 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -407,13 +407,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* to find an entry in a page-table-directory. 
*/ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define __pgd_offset(address) pgd_index(address) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define __pud_offset(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) /* Find an entry in the third-level page table.. */ #define pte_index(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff --git a/arch/sh/include/asm/pgtable_64.h b/arch/sh/include/asm/pgtable_64.h index 1778bc5971e7..27cc282ec6c0 100644 --- a/arch/sh/include/asm/pgtable_64.h +++ b/arch/sh/include/asm/pgtable_64.h @@ -46,14 +46,13 @@ static __inline__ void set_pte(pte_t *pteptr, pte_t pteval) /* To find an entry in a generic PGD. */ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define __pgd_offset(address) pgd_index(address) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) /* To find an entry in a kernel PGD. */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define __pud_offset(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define __pmd_offset(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +/* #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) */ /* * PMD level access routines. Same notes as above. diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index 60c828a2b8a2..037aab2708b7 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -136,6 +136,7 @@ EXPORT_SYMBOL_GPL(match_trapped_io_handler); static struct trapped_io *lookup_tiop(unsigned long address) { pgd_t *pgd_k; + p4d_t *p4d_k; pud_t *pud_k; pmd_t *pmd_k; pte_t *pte_k; @@ -145,7 +146,11 @@ static struct trapped_io *lookup_tiop(unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud_k = pud_offset(pgd_k, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index eee911422cf9..45943bcb7042 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -209,6 +209,7 @@ static void sh4_flush_cache_page(void *args) unsigned long address, pfn, phys; int map_coherent = 0; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -224,7 +225,8 @@ static void sh4_flush_cache_page(void *args) return; pgd = pgd_offset(vma->vm_mm, address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index 445b5e69b73c..442a77cc2957 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -383,6 +383,7 @@ static void sh64_dcache_purge_user_pages(struct mm_struct *mm, unsigned long addr, unsigned long end) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -397,7 +398,11 @@ static void sh64_dcache_purge_user_pages(struct mm_struct *mm, if (pgd_bad(*pgd)) return; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if 
(p4d_none(*p4d) || p4d_bad(*p4d)) + return; + + pud = pud_offset(p4d, addr); if (pud_none(*pud) || pud_bad(*pud)) return; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 5f23d7907597..7260a1a7fdca 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -47,12 +47,13 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) pgd = swapper_pg_dir; } - printk(KERN_ALERT "pgd = %p\n", pgd); + pr_alert("pgd = %p\n", pgd); pgd += pgd_index(addr); - printk(KERN_ALERT "[%08lx] *pgd=%0*Lx", addr, - (u32)(sizeof(*pgd) * 2), (u64)pgd_val(*pgd)); + pr_alert("[%08lx] *pgd=%0*llx", addr, (u32)(sizeof(*pgd) * 2), + (u64)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -61,33 +62,46 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; if (pgd_bad(*pgd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (PTRS_PER_P4D != 1) + pr_cont(", *p4d=%0*Lx", (u32)(sizeof(*p4d) * 2), + (u64)p4d_val(*p4d)); + + if (p4d_none(*p4d)) + break; + + if (p4d_bad(*p4d)) { + pr_cont("(bad)"); + break; + } + + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) - printk(", *pud=%0*Lx", (u32)(sizeof(*pud) * 2), - (u64)pud_val(*pud)); + pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2), + (u64)pud_val(*pud)); if (pud_none(*pud)) break; if (pud_bad(*pud)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pmd = pmd_offset(pud, addr); if (PTRS_PER_PMD != 1) - printk(", *pmd=%0*Lx", (u32)(sizeof(*pmd) * 2), - (u64)pmd_val(*pmd)); + pr_cont(", *pmd=%0*llx", (u32)(sizeof(*pmd) * 2), + (u64)pmd_val(*pmd)); if (pmd_none(*pmd)) break; if (pmd_bad(*pmd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } @@ -96,17 +110,18 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; pte = pte_offset_kernel(pmd, addr); - printk(", *pte=%0*Lx", (u32)(sizeof(*pte) * 2), - (u64)pte_val(*pte)); + pr_cont(", *pte=%0*llx", (u32)(sizeof(*pte) * 2), + (u64)pte_val(*pte)); } while (0); - printk("\n"); + pr_cont("\n"); } static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -116,8 +131,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -188,14 +208,12 @@ show_fault_oops(struct pt_regs *regs, unsigned long address) if (!oops_may_print()) return; - printk(KERN_ALERT "BUG: unable to handle kernel "); - if (address < PAGE_SIZE) - printk(KERN_CONT "NULL pointer dereference"); - else - printk(KERN_CONT "paging request"); - - printk(KERN_CONT " at %08lx\n", address); printk(KERN_ALERT "PC:"); + pr_alert("BUG: unable to handle kernel %s at %08lx\n", + address < PAGE_SIZE ? 
"NULL pointer dereference" + : "paging request", + address); + pr_alert("PC:"); printk_address(regs->pc, 1); show_pte(NULL, address); diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 960deb1f24a1..acd5652a0de3 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -26,17 +26,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (p4d) { + pud = pud_alloc(mm, p4d, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); + } } } @@ -47,17 +51,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_offset(pgd, addr); - if (pud) { - pmd = pmd_offset(pud, addr); - if (pmd) - pte = pte_offset_map(pmd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d) { + pud = pud_offset(p4d, addr); + if (pud) { + pmd = pmd_offset(pud, addr); + if (pmd) + pte = pte_offset_map(pmd, addr); + } } } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 628f461b8993..a70ba0fdd0b3 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -45,6 +45,7 @@ void __init __weak plat_mem_setup(void) static pte_t *__get_pte_phys(unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -54,7 +55,13 @@ static pte_t *__get_pte_phys(unsigned long addr) return NULL; } - pud = pud_alloc(NULL, pgd, addr); + p4d = p4d_alloc(NULL, pgd, addr); + if (unlikely(!p4d)) { + p4d_ERROR(*p4d); + return NULL; + } + + pud = pud_alloc(NULL, p4d, addr); if (unlikely(!pud)) { pud_ERROR(*pud); return NULL; @@ -172,9 +179,9 @@ void __init page_table_range_init(unsigned long start, unsigned long end, unsigned long vaddr; vaddr = start; - i = __pgd_offset(vaddr); - j = __pud_offset(vaddr); - k = __pmd_offset(vaddr); + i = pgd_index(vaddr); + j = pud_index(vaddr); + k = pmd_index(vaddr); pgd = pgd_base + i; for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 9e6b38b03cf7..0e7039137f5a 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -15,7 +15,7 @@ #include <asm/cacheflush.h> #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)), vaddr) static pte_t *kmap_coherent_pte; diff --git a/arch/sh/mm/tlbex_32.c b/arch/sh/mm/tlbex_32.c index 382262dc0c4b..1c53868632ee 100644 --- a/arch/sh/mm/tlbex_32.c +++ b/arch/sh/mm/tlbex_32.c @@ -23,6 +23,7 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -42,7 +43,10 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, pgd = pgd_offset(current->mm, address); } - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 1; + pud = pud_offset(p4d, address); if (pud_none_or_clear_bad(pud)) return 1; pmd = pmd_offset(pud, address); diff --git a/arch/sh/mm/tlbex_64.c b/arch/sh/mm/tlbex_64.c index 8ff966dd0c74..0d015f7556fa 100644 --- 
a/arch/sh/mm/tlbex_64.c +++ b/arch/sh/mm/tlbex_64.c @@ -44,6 +44,7 @@ static int handle_tlbmiss(unsigned long long protection_flags, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -58,7 +59,11 @@ static int handle_tlbmiss(unsigned long long protection_flags, pgd = pgd_offset(current->mm, address); } - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || !p4d_present(*p4d)) + return 1; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || !pud_present(*pud)) return 1; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 18d776925c45..ddb03c04f1f3 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -25,11 +25,12 @@ #include <asm/vaddrs.h> #include <asm/kmap_types.h> #include <asm/pgtable.h> +#include <asm/pgtsrmmu.h> /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pgprot_t kmap_prot; +#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; void kmap_init(void) __init; @@ -50,28 +51,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_high(struct page *page); -void kunmap_high(struct page *page); - -static inline void *kmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index d4a80adea7e5..6ff6e2a9f9b3 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -32,8 +32,6 @@ #include <asm/pgalloc.h> #include <asm/vaddrs.h> -pgprot_t kmap_prot; - static pte_t *kmap_pte; void __init kmap_init(void) @@ -50,19 +48,13 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_pte = pte_offset_kernel(dir, address); - kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; long idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -77,7 +69,7 @@ void *kmap_atomic(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte-idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); /* XXX Fix - Anton */ #if 0 __flush_tlb_one(vaddr); @@ -87,18 +79,15 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); @@ -131,7 +120,5 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/mm/io-unit.c 
b/arch/sparc/mm/io-unit.c index 289276b99b01..08238d989cfd 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/mm.h> -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/of.h> diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index b00dde13681b..f1e08e30b64e 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/highmem.h> /* pte_offset_map => kmap_atomic */ #include <linux/dma-mapping.h> #include <linux/of.h> #include <linux/of_device.h> diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 3b8731b3a937..826f49edd94e 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -9,7 +9,6 @@ #ifndef __UNICORE_PGTABLE_H__ #define __UNICORE_PGTABLE_H__ -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #include <asm/cpu-single.h> diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c index f3812245cc00..ccad051a79b6 100644 --- a/arch/unicore32/kernel/hibernate.c +++ b/arch/unicore32/kernel/hibernate.c @@ -33,9 +33,11 @@ struct swsusp_arch_regs swsusp_arch_regs_cpu0; static pmd_t *resume_one_md_table_init(pgd_t *pgd) { pud_t *pud; + p4d_t *p4d; pmd_t *pmd_table; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57d1c4e36738..1a54aeb40626 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,7 @@ config X86 select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,15 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -void *kmap(struct page *page); -void kunmap(struct page *page); - -void *kmap_atomic_prot(struct page *page, pgprot_t prot); -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index df1373415f11..8d03ffd43794 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { } struct mm_struct; 
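For reference: every pud_offset()-to-p4d_offset() conversion above (powerpc, sh, sparc, unicore32) repeats one idiom, inserting the p4d level between pgd and pud now that the 5level-fixup hack is gone. A minimal sketch of the resulting full descent; walk_to_pte() is a hypothetical helper for illustration, not code from this series:

static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);	/* the new level; a no-op where P4D is folded */
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);	/* now takes a p4d_t *, not a pgd_t * */
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}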
+#define mm_p4d_folded mm_p4d_folded +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} + void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 0a1898b8552e..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,44 +4,11 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); - -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 04e9340eac4b..d6a10704307a 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,38 +63,11 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void *kmap_high(struct page *page); -void kunmap_high(struct page *page); - -static inline void *kmap(struct page *page) -{ - /* Check if this memory layout is broken because PKMAP overlaps - * page table. 
- */ - BUILD_BUG_ON(PKMAP_BASE < - TLBTEMP_BASE_1 + TLBTEMP_SIZE); - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -static inline void kunmap(struct page *page) -{ - BUG_ON(in_interrupt()); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void flush_cache_kmaps(void) { flush_cache_all(); } -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); - void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 184ceadccc1a..99b5ad137ab5 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,29 +37,24 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - idx = kmap_idx(kmap_atomic_idx_push(), DCACHE_ALIAS(page_to_phys(page))); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, PAGE_KERNEL_EXEC)); + set_pte(kmap_pte + idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) { @@ -78,16 +73,17 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { unsigned long kmap_vstart; + /* Check if this memory layout is broken because PKMAP overlaps + * page table. 
+ */ + BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); /* cache the first kmap pte */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5ee8e3fae551..33e3b76c4fa9 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -29,7 +29,6 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_ZSTD) "zstd", #endif - NULL }; static void zcomp_strm_free(struct zcomp_strm *zstrm) @@ -64,7 +63,7 @@ bool zcomp_available_algorithm(const char *comp) { int i; - i = __sysfs_match_string(backends, -1, comp); + i = sysfs_match_string(backends, comp); if (i >= 0) return true; @@ -83,9 +82,9 @@ ssize_t zcomp_available_show(const char *comp, char *buf) { bool known_algorithm = false; ssize_t sz = 0; - int i = 0; + int i; - for (; backends[i]; i++) { + for (i = 0; i < ARRAY_SIZE(backends); i++) { if (!strcmp(comp, backends[i])) { known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 3107ce80e809..16850d5388ab 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -44,6 +44,7 @@ struct dax_region { * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) * @dax_mem_res: physical address range of hotadded DAX memory + * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed() */ struct dev_dax { struct dax_region *region; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 1e678bdf5aed..275aa5f87399 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -14,6 +14,11 @@ #include "dax-private.h" #include "bus.h" +/* Memory resource name used for add_memory_driver_managed(). */ +static const char *kmem_name; +/* Set if any memory will remain added when the driver is unloaded. */ +static bool any_hotremove_failed; + int dev_dax_kmem_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -70,7 +75,12 @@ int dev_dax_kmem_probe(struct device *dev) */ new_res->flags = IORESOURCE_SYSTEM_RAM; - rc = add_memory(numa_node, new_res->start, resource_size(new_res)); + /* + * Ensure that future kexec'd kernels will not treat this as RAM + * automatically. + */ + rc = add_memory_driver_managed(numa_node, new_res->start, + resource_size(new_res), kmem_name); if (rc) { release_resource(new_res); kfree(new_res); @@ -100,6 +110,7 @@ static int dev_dax_kmem_remove(struct device *dev) */ rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); if (rc) { + any_hotremove_failed = true; dev_err(dev, "DAX region %pR cannot be hotremoved until the next reboot\n", res); @@ -124,6 +135,7 @@ static int dev_dax_kmem_remove(struct device *dev) * permanently pinned as reserved by the unreleased * request_mem_region(). */ + any_hotremove_failed = true; return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -137,12 +149,24 @@ static struct dax_device_driver device_dax_kmem_driver = { static int __init dax_kmem_init(void) { - return dax_driver_register(&device_dax_kmem_driver); + int rc; + + /* Resource name is permanently allocated if any hotremove fails.
*/ + kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); + if (!kmem_name) + return -ENOMEM; + + rc = dax_driver_register(&device_dax_kmem_driver); + if (rc) + kfree_const(kmem_name); + return rc; } static void __exit dax_kmem_exit(void) { dax_driver_unregister(&device_dax_kmem_driver); + if (!any_hotremove_failed) + kfree_const(kmem_name); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 52d2b71f1588..f09b096ba4fd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page) return 0; } -#ifdef CONFIG_X86 -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot) -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr) -#else -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot) -#define __ttm_kunmap_atomic(__addr) vunmap(__addr) -#endif - - -/** - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with - * specified page protection. - * - * @page: The page to map. - * @prot: The page protection. - * - * This function maps a TTM page using the kmap_atomic api if available, - * otherwise falls back to vmap. The user must make sure that the - * specified page does not have an aliased mapping with a different caching - * policy unless the architecture explicitly allows it. Also mapping and - * unmapping using this api must be correctly nested. Unmapping should - * occur in the reverse order of mapping. - */ -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - return kmap_atomic(page); - else - return __ttm_kmap_atomic_prot(page, prot); -} -EXPORT_SYMBOL(ttm_kmap_atomic_prot); - -/** - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using - * ttm_kmap_atomic_prot. - * - * @addr: The virtual address from the map. - * @prot: The page protection. 
- */ -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot) -{ - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) - kunmap_atomic(addr); - else - __ttm_kunmap_atomic(addr); -} -EXPORT_SYMBOL(ttm_kunmap_atomic_prot); - static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, unsigned long page, pgprot_t prot) @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, return -ENOMEM; src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); - dst = ttm_kmap_atomic_prot(d, prot); + dst = kmap_atomic_prot(d, prot); if (!dst) return -ENOMEM; memcpy_fromio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(dst, prot); + kunmap_atomic(dst); return 0; } @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, return -ENOMEM; dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); - src = ttm_kmap_atomic_prot(s, prot); + src = kmap_atomic_prot(s, prot); if (!src) return -ENOMEM; memcpy_toio(dst, src, PAGE_SIZE); - ttm_kunmap_atomic_prot(src, prot); + kunmap_atomic(src); return 0; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index bb46ca0c458f..1629427d5734 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -27,6 +27,7 @@ **************************************************************************/ #include "vmwgfx_drv.h" +#include <linux/highmem.h> /* * Template that implements find_first_diff() for a generic @@ -374,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); if (unmap_src) { - ttm_kunmap_atomic_prot(d->src_addr, d->src_prot); + kunmap_atomic(d->src_addr); d->src_addr = NULL; } if (unmap_dst) { - ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot); + kunmap_atomic(d->dst_addr); d->dst_addr = NULL; } @@ -388,8 +389,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return -EINVAL; d->dst_addr = - ttm_kmap_atomic_prot(d->dst_pages[dst_page], - d->dst_prot); + kmap_atomic_prot(d->dst_pages[dst_page], + d->dst_prot); if (!d->dst_addr) return -ENOMEM; @@ -401,8 +402,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, return -EINVAL; d->src_addr = - ttm_kmap_atomic_prot(d->src_pages[src_page], - d->src_prot); + kmap_atomic_prot(d->src_pages[src_page], + d->src_prot); if (!d->src_addr) return -ENOMEM; @@ -499,9 +500,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, } out: if (d.src_addr) - ttm_kunmap_atomic_prot(d.src_addr, d.src_prot); + kunmap_atomic(d.src_addr); if (d.dst_addr) - ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot); + kunmap_atomic(d.dst_addr); return ret; } diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 10af330153b5..451608e960a1 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -572,14 +572,12 @@ static void dma_req_free(struct kref *ref) struct mport_dma_req *req = container_of(ref, struct mport_dma_req, refcount); struct mport_cdev_priv *priv = req->priv; - unsigned int i; dma_unmap_sg(req->dmach->device->dev, req->sgt.sgl, req->sgt.nents, req->dir); sg_free_table(&req->sgt); if (req->page_list) { - for (i = 0; i < req->nr_pages; i++) - put_page(req->page_list[i]); + unpin_user_pages(req->page_list, req->nr_pages); kfree(req->page_list); } @@ -815,7 +813,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, struct mport_dma_req *req; struct mport_dev *md = priv->md; struct dma_chan *chan; 
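With the TTM-private wrappers gone, callers use the generic atomic-kmap pair directly, as the ttm and vmwgfx hunks above show. A usage sketch (copy_one_page(), src_page, dst_page and prot are placeholders); the deleted ttm kernel-doc states the ground rules, which still apply: no sleeping while an atomic map is held, and unmapping must occur in the reverse order of mapping. Note kunmap_atomic() no longer needs the pgprot argument.

static void copy_one_page(struct page *dst_page, struct page *src_page,
			  pgprot_t prot)
{
	void *src = kmap_atomic(src_page);
	void *dst = kmap_atomic_prot(dst_page, prot);

	memcpy(dst, src, PAGE_SIZE);	/* must not sleep while maps are held */

	kunmap_atomic(dst);	/* release in reverse order of mapping */
	kunmap_atomic(src);
}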
- int i, ret; + int ret; int nents; if (xfer->length == 0) @@ -862,7 +860,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, goto err_req; } - pinned = get_user_pages_fast( + pinned = pin_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, nr_pages, dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, @@ -870,7 +868,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (pinned != nr_pages) { if (pinned < 0) { - rmcd_error("get_user_pages_unlocked err=%ld", + rmcd_error("pin_user_pages_fast err=%ld", pinned); nr_pages = 0; } else @@ -951,8 +949,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, err_pg: if (!req->page_list) { - for (i = 0; i < nr_pages; i++) - put_page(page_list[i]); + unpin_user_pages(page_list, nr_pages); kfree(page_list); } err_req: @@ -2384,13 +2381,6 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) cdev_init(&md->cdev, &mport_fops); md->cdev.owner = THIS_MODULE; - ret = cdev_device_add(&md->cdev, &md->dev); - if (ret) { - rmcd_error("Failed to register mport %d (err=%d)", - mport->id, ret); - goto err_cdev; - } - INIT_LIST_HEAD(&md->doorbells); spin_lock_init(&md->db_lock); INIT_LIST_HEAD(&md->portwrites); @@ -2410,6 +2400,13 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) #else md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER; #endif + + ret = cdev_device_add(&md->cdev, &md->dev); + if (ret) { + rmcd_error("Failed to register mport %d (err=%d)", + mport->id, ret); + goto err_cdev; + } ret = rio_query_mport(mport, &attr); if (!ret) { md->properties.flags = attr.flags; diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index aa45840d8273..de624c47e190 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -31,6 +31,7 @@ #include <linux/types.h> #include <linux/genalloc.h> #include <linux/io.h> +#include <linux/kcov.h> #include <linux/phy/phy.h> #include <linux/usb.h> @@ -1645,7 +1646,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; + kcov_remote_start_usb((u64)urb->dev->bus->busnum); urb->complete(urb); + kcov_remote_stop(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 44813ceecc47..e5d50bd880e9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -353,8 +353,6 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -394,8 +392,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 995883693cb2..06b9b9fddf70 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -64,15 +64,15 @@ static int load_em86(struct linux_binprm *bprm) * user environment and arguments are stored. 
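The rio_mport_cdev change above is a standard FOLL_PIN conversion: pin_user_pages_fast() replaces get_user_pages_fast(), and one unpin_user_pages() call replaces the manual put_page() loop. A minimal sketch of the pairing (the function and variable names here are placeholders, not code from this driver):

static int dma_with_pinned_pages(unsigned long uaddr, int nr_pages,
				 bool write, struct page **pages)
{
	int pinned = pin_user_pages_fast(uaddr & PAGE_MASK, nr_pages,
					 write ? FOLL_WRITE : 0, pages);

	if (pinned < 0)
		return pinned;		/* error: nothing was pinned */

	/* ... build the sg table and run the DMA against the pinned pages ... */

	unpin_user_pages(pages, pinned);	/* replaces the put_page() loop */
	return 0;
}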
*/ remove_arg_zero(bprm); - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) return retval; bprm->argc++; if (i_arg) { - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bc5506619b7e..3880a82da1dc 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -163,13 +163,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->have_execfd = 1; /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto ret; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &fmt->interpreter, bprm); + retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto ret; bprm->argc++; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 0e8b953d12cf..1b6625e95958 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -106,19 +106,19 @@ static int load_script(struct linux_binprm *bprm) retval = remove_arg_zero(bprm); if (retval) return retval; - retval = copy_strings_kernel(1, &bprm->interp, bprm); + retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; *((char *)i_end) = '\0'; if (i_arg) { *((char *)i_sep) = '\0'; - retval = copy_strings_kernel(1, &i_arg, bprm); + retval = copy_string_kernel(i_arg, bprm); if (retval < 0) return retval; bprm->argc++; } - retval = copy_strings_kernel(1, &i_name, bprm); + retval = copy_string_kernel(i_name, bprm); if (retval) return retval; bprm->argc++; diff --git a/fs/exec.c b/fs/exec.c index e850ee7dd636..93ff1c4c7ebb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -590,24 +590,48 @@ out: } /* - * Like copy_strings, but get argv and its values from kernel memory. + * Copy an argument/environment string from the kernel to the process's stack. */ -int copy_strings_kernel(int argc, const char *const *__argv, - struct linux_binprm *bprm) +int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { - int r; - mm_segment_t oldfs = get_fs(); - struct user_arg_ptr argv = { - .ptr.native = (const char __user *const __user *)__argv, - }; + int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; + unsigned long pos = bprm->p; + + if (len == 0) + return -EFAULT; + if (!valid_arg_len(bprm, len)) + return -E2BIG; + + /* We're going to work our way backwards.
*/ + arg += len; + bprm->p -= len; + if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) + return -E2BIG; - set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); - set_fs(oldfs); + while (len > 0) { + unsigned int bytes_to_copy = min_t(unsigned int, len, + min_not_zero(offset_in_page(pos), PAGE_SIZE)); + struct page *page; + char *kaddr; - return r; + pos -= bytes_to_copy; + arg -= bytes_to_copy; + len -= bytes_to_copy; + + page = get_arg_page(bprm, pos, 1); + if (!page) + return -E2BIG; + kaddr = kmap_atomic(page); + flush_arg_page(bprm, pos & PAGE_MASK, page); + memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); + flush_kernel_dcache_page(page); + kunmap_atomic(kaddr); + put_arg_page(page); + } + + return 0; } -EXPORT_SYMBOL(copy_strings_kernel); +EXPORT_SYMBOL(copy_string_kernel); #ifdef CONFIG_MMU @@ -1883,7 +1907,7 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval) goto out; - retval = copy_strings_kernel(1, &bprm->filename, bprm); + retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 3647c65a0f48..bbfe18c07417 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -632,20 +632,80 @@ error: } EXPORT_SYMBOL_GPL(fat_free_clusters); -/* 128kb is the whole sectors for FAT12 and FAT16 */ -#define FAT_READA_SIZE (128 * 1024) +struct fatent_ra { + sector_t cur; + sector_t limit; + + unsigned int ra_blocks; + sector_t ra_advance; + sector_t ra_next; + sector_t ra_limit; +}; -static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, - unsigned long reada_blocks) +static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent, int ent_limit) { - const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; - sector_t blocknr; - int i, offset; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + sector_t blocknr, block_end; + int offset; + /* + * This is the sequential read, so ra_pages * 2 (but try to + * align the optimal hardware IO size). + * [BTW, 128kb covers the whole sectors for FAT12 and FAT16] + */ + unsigned long ra_pages = sb->s_bdi->ra_pages; + unsigned int reada_blocks; + if (ra_pages > sb->s_bdi->io_pages) + ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages); + reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1); + + /* Initialize the range for sequential read */ ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end); + ra->cur = 0; + ra->limit = (block_end + 1) - blocknr; - for (i = 0; i < reada_blocks; i++) - sb_breadahead(sb, blocknr + i); + /* Advancing the window at half size */ + ra->ra_blocks = reada_blocks >> 1; + ra->ra_advance = ra->cur; + ra->ra_next = ra->cur; + ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit); +} + +/* Assuming to be called before reading a new block (increments ->cur). */ +static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, + struct fat_entry *fatent) +{ + if (ra->ra_next >= ra->ra_limit) + return; + + if (ra->cur >= ra->ra_advance) { + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + struct blk_plug plug; + sector_t blocknr, diff; + int offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + diff = blocknr - ra->cur; + blk_start_plug(&plug); + /* + * FIXME: we would want to directly use the bio with + * pages to reduce the number of segments. 
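A worked example of the readahead window bookkeeping above, with hypothetical numbers: suppose reada_blocks works out to 32. fat_ra_init() then sets ra_blocks = 16 (half the window) and an initial window of [cur, cur + 32), clamped to the FAT's last block. On the first fat_ent_reada() call, cur equals ra_advance, so all 32 blocks are read ahead under one blk plug; ra_advance moves to 16 and ra_limit grows by up to 16. Thereafter, each time cur catches up with ra_advance (every 16 blocks consumed), another 16 blocks are issued, keeping roughly a full window in flight ahead of the reader.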
+ */ + for (; ra->ra_next < ra->ra_limit; ra->ra_next++) + sb_breadahead(sb, ra->ra_next + diff); + blk_finish_plug(&plug); + + /* Advance the readahead window */ + ra->ra_advance += ra->ra_blocks; + ra->ra_limit += min_t(sector_t, + ra->ra_blocks, ra->limit - ra->ra_limit); + } + ra->cur++; } int fat_count_free_clusters(struct super_block *sb) @@ -653,27 +713,20 @@ int fat_count_free_clusters(struct super_block *sb) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; - unsigned long reada_blocks, reada_mask, cur_block; + struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - cur_block = 0; - free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); + fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ - if ((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) @@ -707,9 +760,9 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; + struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; - unsigned long reada_blocks, reada_mask, cur_block = 0; int err = 0; /* @@ -727,19 +780,13 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; - reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; - reada_mask = reada_blocks - 1; - fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); + fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ - if ((cur_block & reada_mask) == 0) { - unsigned long rest = sbi->fat_length - cur_block; - fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); - } - cur_block++; + fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index e6e68b2274a5..a0cf99debb1e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1519,6 +1519,12 @@ static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, goto out; } + if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT sectors"); + goto out; + } + error = 0; out: diff --git a/fs/proc/array.c b/fs/proc/array.c index 043311014db2..713ffac59bbb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -248,8 +248,8 @@ void render_sigset_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } -static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, - sigset_t *catch) +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, + sigset_t *sigcatch) { struct k_sigaction *k; int i; @@ -257,9 +257,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) - sigaddset(ign, i); + sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) - sigaddset(catch, i); + sigaddset(sigcatch, i); } } diff --git 
a/fs/seq_file.c b/fs/seq_file.c index 70f5fdf99bf6..4e6239f33c06 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,8 @@ * initial implementation -- AV, Oct 2001. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> @@ -233,9 +235,8 @@ Fill: p = m->op->next(m, p, &m->index); if (pos == m->index) { - pr_info_ratelimited("buggy seq_file .next function %ps " - "did not updated position index\n", - m->op->next); + pr_info_ratelimited("buggy .next function %ps did not update position index\n", + m->op->next); m->index++; } if (!p || IS_ERR(p)) { diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h deleted file mode 100644 index 58046ddc08d0..000000000000 --- a/include/asm-generic/5level-fixup.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _5LEVEL_FIXUP_H -#define _5LEVEL_FIXUP_H - -#define __ARCH_HAS_5LEVEL_HACK -#define __PAGETABLE_P4D_FOLDED 1 - -#define P4D_SHIFT PGDIR_SHIFT -#define P4D_SIZE PGDIR_SIZE -#define P4D_MASK PGDIR_MASK -#define MAX_PTRS_PER_P4D 1 -#define PTRS_PER_P4D 1 - -#define p4d_t pgd_t - -#define pud_alloc(mm, p4d, address) \ - ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \ - NULL : pud_offset(p4d, address)) - -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_alloc_track(mm, pgd, address, mask) (pgd) -#define p4d_offset(pgd, start) (pgd) - -#ifndef __ASSEMBLY__ -static inline int p4d_none(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_bad(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_present(p4d_t p4d) -{ - return 1; -} -#endif - -#define p4d_ERROR(p4d) do { } while (0) -#define p4d_clear(p4d) pgd_clear(p4d) -#define p4d_val(p4d) pgd_val(p4d) -#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_page(p4d) pgd_page(p4d) -#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) - -#define __p4d(x) __pgd(x) -#define set_p4d(p4dp, p4d) set_pgd(p4dp, p4d) - -#undef p4d_free_tlb -#define p4d_free_tlb(tlb, x, addr) do { } while (0) -#define p4d_free(mm, x) do { } while (0) - -#undef p4d_addr_end -#define p4d_addr_end(addr, end) (end) - -#endif diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h deleted file mode 100644 index 829bdb0d6327..000000000000 --- a/include/asm-generic/pgtable-nop4d-hack.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PGTABLE_NOP4D_HACK_H -#define _PGTABLE_NOP4D_HACK_H - -#ifndef __ASSEMBLY__ -#include <asm-generic/5level-fixup.h> - -#define __PAGETABLE_PUD_FOLDED 1 - -/* - * Having the pud type consist of a pgd gets the size right, and allows - * us to conceptually access the pgd entry that this pud is folded into - * without casting. 
- */ -typedef struct { pgd_t pgd; } pud_t; - -#define PUD_SHIFT PGDIR_SHIFT -#define PTRS_PER_PUD 1 -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* - * The "pgd_xxx()" functions here are trivial for a folded two-level - * setup: the pud is never bad, and a pud always exists (as it's folded - * into the pgd entry) - */ -static inline int pgd_none(pgd_t pgd) { return 0; } -static inline int pgd_bad(pgd_t pgd) { return 0; } -static inline int pgd_present(pgd_t pgd) { return 1; } -static inline void pgd_clear(pgd_t *pgd) { } -#define pud_ERROR(pud) (pgd_ERROR((pud).pgd)) - -#define pgd_populate(mm, pgd, pud) do { } while (0) -#define pgd_populate_safe(mm, pgd, pud) do { } while (0) -/* - * (puds are folded into pgds so this doesn't get actually called, - * but the define is needed for a generic inline function.) - */ -#define set_pgd(pgdptr, pgdval) set_pud((pud_t *)(pgdptr), (pud_t) { pgdval }) - -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) -{ - return (pud_t *)pgd; -} - -#define pud_val(x) (pgd_val((x).pgd)) -#define __pud(x) ((pud_t) { __pgd(x) }) - -#define pgd_page(pgd) (pud_page((pud_t){ pgd })) -#define pgd_page_vaddr(pgd) (pud_page_vaddr((pud_t){ pgd })) - -/* - * allocating and freeing a pud is trivial: the 1-entry pud is - * inside the pgd, so has no extra memory associated with it. - */ -#define pud_alloc_one(mm, address) NULL -#define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x, a) do { } while (0) - -#undef pud_addr_end -#define pud_addr_end(addr, end) (end) - -#endif /* __ASSEMBLY__ */ -#endif /* _PGTABLE_NOP4D_HACK_H */ diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index d3776cb494c0..ad05c1684bfc 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -4,9 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef __ARCH_USE_5LEVEL_HACK -#include <asm-generic/pgtable-nop4d-hack.h> -#else #include <asm-generic/pgtable-nop4d.h> #define __PAGETABLE_PUD_FOLDED 1 @@ -65,5 +62,4 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) #define pud_addr_end(addr, end) (end) #endif /* __ASSEMBLY__ */ -#endif /* !__ARCH_USE_5LEVEL_HACK */ #endif /* _PGTABLE_NOPUD_H */ diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 0a9d042e075a..de1ccdcd5703 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo); int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, struct ttm_bo_device *bdev); -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot); - -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot); - /** * ttm_bo_io * diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index aece1b340e7d..4a20b7517dd0 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -130,8 +130,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); -extern int copy_strings_kernel(int argc, const char *const *argv, - struct linux_binprm *bprm); +int copy_string_kernel(const char *arg, struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9acf654f0b19..99f2ac30b1d9 
100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -72,7 +72,7 @@ static inline int get_bitmask_order(unsigned int count) static __always_inline unsigned long hweight_long(unsigned long w) { - return sizeof(w) == 4 ? hweight32(w) : hweight64(w); + return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 594d4e78654f..69b136e4dd2b 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -54,7 +54,7 @@ .popsection ; #define ELFNOTE(name, type, desc) \ - ELFNOTE_START(name, type, "") \ + ELFNOTE_START(name, type, "a") \ desc ; \ ELFNOTE_END diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ea5cdbd8c2c3..d6e82e3de027 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,8 +32,65 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include <asm/kmap_types.h> #ifdef CONFIG_HIGHMEM +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +extern void kunmap_atomic_high(void *kvaddr); #include <asm/highmem.h> +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + +void *kmap_high(struct page *page); +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + +void kunmap_high(struct page *page); + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + * + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it.
+ */ +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_atomic_high_prot(page, prot); +} +#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -77,15 +134,21 @@ static inline struct page *kmap_to_page(void *addr) static inline unsigned long totalhigh_pages(void) { return 0UL; } -#ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } +static inline void kunmap_high(struct page *page) +{ +} + static inline void kunmap(struct page *page) { +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif } static inline void *kmap_atomic(struct page *page) @@ -96,16 +159,20 @@ static inline void *kmap_atomic(struct page *page) } #define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { - pagefault_enable(); - preempt_enable(); + /* + * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * handles re-enabling faults + preemption + */ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_flush_unused() do {} while(0) -#endif #endif /* CONFIG_HIGHMEM */ @@ -149,7 +216,9 @@ static inline void kmap_atomic_idx_pop(void) #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ - __kunmap_atomic(addr); \ + kunmap_atomic_high(addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..cc9a5b4593ca 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -103,6 +103,7 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) +#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 93d9ada74ddd..fee7fab5d706 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -314,19 +314,12 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else -static inline bool is_mem_section_removable(unsigned long pfn, - unsigned long nr_pages) -{ - return false; -} - static inline void try_offline_node(int nid) {} static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) @@ -349,6 +342,8 @@ extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); +extern int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned 
long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern void remove_pfn_range_from_zone(struct zone *zone, diff --git a/include/linux/mm.h b/include/linux/mm.h index 66e0977f970a..86adc71a972f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -776,6 +776,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) } extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); /* * Mapcount of compound page as a whole, does not include mapped sub-pages. @@ -1372,7 +1373,7 @@ static inline int cpu_pid_to_cpupid(int nid, int pid) static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) @@ -1725,7 +1726,7 @@ struct frame_vector { unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[0]; /* Array of pinned pfns / pages. Use + void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; @@ -2069,11 +2070,6 @@ int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) -/* - * The following ifdef needed to get the 5level-fixup.h header to work. - * Remove it when 5level-fixup.h has been removed. - */ -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2102,8 +2098,6 @@ static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, return p4d_offset(pgd, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ - static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) diff --git a/include/linux/sched.h b/include/linux/sched.h index 57a5ce9f33c5..c5d96e3e7fff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,6 +1247,9 @@ struct task_struct { /* KCOV sequence number: */ int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 1672cf6f7614..813614d4b71f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -145,6 +145,25 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); +#define DEFINE_SEQ_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + int ret = seq_open(file, &__name ## _sops); \ + if (!ret && inode->i_private) { \ + struct seq_file *seq_f = file->private_data; \ + seq_f->private = inode->i_private; \ + } \ + return ret; \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release, \ +} + #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ diff --git a/init/Kconfig b/init/Kconfig index fdb4f52609c6..2d12d38cdd88 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -260,6 +260,16 @@ config KERNEL_UNCOMPRESSED endchoice +config DEFAULT_INIT + string "Default init path" + default "" + help + This option determines the default init for the system if no init= + option is passed on the kernel command line. 
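(Aside on the DEFINE_SEQ_ATTRIBUTE helper introduced in seq_file.h above: a hedged usage sketch, with all foo_* names hypothetical. Given a set of seq_operations callbacks, the macro generates the matching foo_open() and foo_fops, wiring seq_read/seq_lseek/seq_release.)

	/* foo_start/foo_next/foo_stop/foo_show are iterator callbacks
	 * supplied by the caller; only the _sops name matters to the macro. */
	static const struct seq_operations foo_sops = {
		.start	= foo_start,
		.next	= foo_next,
		.stop	= foo_stop,
		.show	= foo_show,
	};
	DEFINE_SEQ_ATTRIBUTE(foo);	/* generates foo_open() and foo_fops */

	/* foo_fops can then back, e.g., a debugfs file:
	 * debugfs_create_file("foo", 0444, parent, NULL, &foo_fops);
	 */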
If the requested path is + not present, we still fall back to attempting the further + locations (e.g. /sbin/init, etc). If this is empty, we will just use + the fallback list when init= is not passed. + config DEFAULT_HOSTNAME string "Default hostname" default "(none)" diff --git a/init/main.c b/init/main.c index df32f67214d2..76df62fc3e2c 100644 --- a/init/main.c +++ b/init/main.c @@ -1433,6 +1433,16 @@ static int __ref kernel_init(void *unused) panic("Requested init %s failed (error %d).", execute_command, ret); } + + if (CONFIG_DEFAULT_INIT[0] != '\0') { + ret = run_init_process(CONFIG_DEFAULT_INIT); + if (ret) + pr_err("Default init %s failed (error %d)\n", + CONFIG_DEFAULT_INIT, ret); + else + return 0; + } + if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || diff --git a/kernel/kcov.c b/kernel/kcov.c index 8accc9722a81..55c5d883a93e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock); static DEFINE_HASHTABLE(kcov_remote_map, 4); static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); +struct kcov_percpu_data { + void *irq_area; + + unsigned int saved_mode; + unsigned int saved_size; + void *saved_area; + struct kcov *saved_kcov; + int saved_sequence; +}; + +DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); + /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) { @@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle) return NULL; } +/* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) { struct kcov_remote *remote; @@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) struct kcov_remote_area *area; struct list_head *pos; - kcov_debug("size = %u\n", size); list_for_each(pos, &kcov_remote_areas) { area = list_entry(pos, struct kcov_remote_area, list); if (area->size == size) { list_del(&area->list); - kcov_debug("rv = %px\n", area); return area; } } - kcov_debug("rv = NULL\n"); return NULL; } @@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) static void kcov_remote_area_put(struct kcov_remote_area *area, unsigned int size) { - kcov_debug("area = %px, size = %u\n", area, size); INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); } @@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru /* * We are interested in code coverage as a function of syscall inputs, - * so we ignore code executed in interrupts. + * so we ignore code executed in interrupts, unless we are in a remote + * coverage collection section in a softirq. */ - if (!in_task()) + if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ -static void kcov_start(struct task_struct *t, unsigned int size, - void *area, enum kcov_mode mode, int sequence) +static void kcov_start(struct task_struct *t, struct kcov *kcov, + unsigned int size, void *area, enum kcov_mode mode, + int sequence) { kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + t->kcov = kcov; /* Cache in task struct for performance.
*/ t->kcov_size = size; t->kcov_area = area; + t->kcov_sequence = sequence; /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, mode); - t->kcov_sequence = sequence; } static void kcov_stop(struct task_struct *t) { WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); barrier(); + t->kcov = NULL; t->kcov_size = 0; t->kcov_area = NULL; } @@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t) static void kcov_task_reset(struct task_struct *t) { kcov_stop(t); - t->kcov = NULL; t->kcov_sequence = 0; t->kcov_handle = 0; } @@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov) int bkt; struct kcov_remote *remote; struct hlist_node *tmp; + unsigned long flags; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { if (remote->kcov != kcov) continue; - kcov_debug("removing handle %llx\n", remote->handle); hash_del(&remote->hnode); kfree(remote); } /* Do reset before unlock to prevent races with kcov_remote_start(). */ kcov_reset(kcov); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); } static void kcov_disable(struct task_struct *t, struct kcov *kcov) @@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov) void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; + unsigned long flags; kcov = t->kcov; if (kcov == NULL) return; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); /* * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, @@ -430,12 +443,12 @@ void kcov_task_exit(struct task_struct *t) * By combining all three checks into one we get: */ if (WARN_ON(kcov->t != t)) { - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); return; } /* Just to not leave dangling references behind. */ kcov_disable(t, kcov); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kcov_put(kcov); } @@ -446,12 +459,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; + unsigned long flags; area = vmalloc_user(vma->vm_end - vma->vm_start); if (!area) return -ENOMEM; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { @@ -461,7 +475,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) if (!kcov->area) { kcov->area = area; vma->vm_flags |= VM_DONTEXPAND; - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); if (vm_insert_page(vma, vma->vm_start + off, page)) @@ -470,7 +484,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) return 0; } exit: - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); vfree(area); return res; } @@ -550,10 +564,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; + unsigned long flags; switch (cmd) { case KCOV_INIT_TRACE: - kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. 
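For readers tracking the kcov changes above: the ordering that kcov_ioctl_locked() enforces (KCOV_INIT_TRACE first, then mmap(), then KCOV_ENABLE) corresponds to the usual userspace sequence sketched below. This is a minimal sketch condensed from the example in Documentation/dev-tools/kcov.rst; the ioctl definitions mirror include/uapi/linux/kcov.h, and COVER_SIZE is an arbitrary illustrative buffer size.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

#define KCOV_INIT_TRACE	_IOR('c', 1, unsigned long)
#define KCOV_ENABLE	_IO('c', 100)
#define KCOV_DISABLE	_IO('c', 101)
#define KCOV_TRACE_PC	0
#define COVER_SIZE	(64 << 10)	/* in unsigned longs */

int main(void)
{
	unsigned long *cover, n, i;
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);

	if (fd == -1)
		return 1;
	/* Size the buffer; must precede mmap() and KCOV_ENABLE. */
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
		return 1;
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED)
		return 1;
	/* Start collecting coverage for this task. */
	if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC))
		return 1;
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
	read(-1, NULL, 0);	/* the syscall under test */
	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
	for (i = 0; i < n; i++)
		printf("0x%lx\n", cover[i + 1]);
	return ioctl(fd, KCOV_DISABLE, 0);
}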
@@ -572,7 +586,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: - kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point the user must have enabled trace mode, @@ -590,15 +603,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return mode; kcov_fault_in_area(kcov); kcov->mode = mode; - kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode, kcov->sequence); - t->kcov = kcov; kcov->t = t; /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: - kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. */ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -610,7 +621,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov_put(kcov); return 0; case KCOV_REMOTE_ENABLE: - kcov_debug("KCOV_REMOTE_ENABLE\n"); if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; t = current; @@ -627,41 +637,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; - spin_lock(&kcov_remote_lock); + spin_lock_irqsave(&kcov_remote_lock, flags); for (i = 0; i < remote_arg->num_handles; i++) { - kcov_debug("handle %llx\n", remote_arg->handles[i]); if (!kcov_check_handle(remote_arg->handles[i], false, true, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->handles[i]); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } } if (remote_arg->common_handle) { - kcov_debug("common handle %llx\n", - remote_arg->common_handle); if (!kcov_check_handle(remote_arg->common_handle, true, false, false)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return -EINVAL; } remote = kcov_remote_add(kcov, remote_arg->common_handle); if (IS_ERR(remote)) { - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, + flags); kcov_disable(t, kcov); return PTR_ERR(remote); } t->kcov_handle = remote_arg->common_handle; } - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; @@ -677,6 +688,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; + unsigned long flags; if (cmd == KCOV_REMOTE_ENABLE) { if (get_user(remote_num_handles, (unsigned __user *)(arg + @@ -697,9 +709,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } kcov = filep->private_data; - spin_lock(&kcov->lock); + spin_lock_irqsave(&kcov->lock, flags); res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock(&kcov->lock); + spin_unlock_irqrestore(&kcov->lock, flags); kfree(remote_arg); @@ -716,8 +728,8 @@ static const struct file_operations kcov_fops = { /* * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section - * of code in a kernel background thread to allow kcov to be used to collect - * coverage from that part of code.
+ * of code in a kernel background thread or in a softirq to allow kcov to be + * used to collect coverage from that part of code. * * The handle argument of kcov_remote_start() identifies a code section that is * used for coverage collection. A userspace process passes this handle to @@ -728,9 +740,9 @@ static const struct file_operations kcov_fops = { * the type of the kernel thread whose code is being annotated. * * For global kernel threads that are spawned in a limited number of instances - * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each - * instance must be assigned a unique 4-byte instance id. The instance id is - * then combined with a 1-byte subsystem id to get a handle via + * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for + * softirqs, each instance must be assigned a unique 4-byte instance id. The + * instance id is then combined with a 1-byte subsystem id to get a handle via * kcov_remote_handle(subsystem_id, instance_id). * * For local kernel threads that are spawned from system call handlers when a @@ -749,70 +761,136 @@ static const struct file_operations kcov_fops = { * * See Documentation/dev-tools/kcov.rst for more details. * - * Internally, this function looks up the kcov device associated with the + * Internally, kcov_remote_start() looks up the kcov device associated with the * provided handle, allocates an area for coverage collection, and saves the * pointers to kcov and area into the current task_struct to allow coverage to * be collected via __sanitizer_cov_trace_pc(). * In turn, kcov_remote_stop() clears those pointers from task_struct to stop * collecting coverage and copies all collected coverage into the kcov area. */ + +static inline bool kcov_mode_enabled(unsigned int mode) +{ + return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; +} + +void kcov_remote_softirq_start(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + unsigned int mode; + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (kcov_mode_enabled(mode)) { + data->saved_mode = mode; + data->saved_size = t->kcov_size; + data->saved_area = t->kcov_area; + data->saved_sequence = t->kcov_sequence; + data->saved_kcov = t->kcov; + kcov_stop(t); + } +} + +void kcov_remote_softirq_stop(struct task_struct *t) +{ + struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); + + if (data->saved_kcov) { + kcov_start(t, data->saved_kcov, data->saved_size, + data->saved_area, data->saved_mode, + data->saved_sequence); + data->saved_mode = 0; + data->saved_size = 0; + data->saved_area = NULL; + data->saved_sequence = 0; + data->saved_kcov = NULL; + } +} + void kcov_remote_start(u64 handle) { + struct task_struct *t = current; struct kcov_remote *remote; + struct kcov *kcov; + unsigned int mode; void *area; - struct task_struct *t; unsigned int size; - enum kcov_mode mode; int sequence; + unsigned long flags; if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (WARN_ON(!in_task())) + if (!in_task() && !in_serving_softirq()) return; - t = current; + + local_irq_save(flags); + /* - * Check that kcov_remote_start is not called twice - * nor called by user tasks (with enabled kcov). + * Check that kcov_remote_start() is not called twice in background + * threads nor called by user tasks (with enabled kcov).
*/ - if (WARN_ON(t->kcov)) + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { + local_irq_restore(flags); return; - - kcov_debug("handle = %llx\n", handle); + } + /* + * Check that kcov_remote_start() is not called twice in softirqs. + * Note that kcov_remote_start() can be called from a softirq that + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); + return; + } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - kcov_debug("no remote found"); - spin_unlock(&kcov_remote_lock); + spin_unlock_irqrestore(&kcov_remote_lock, flags); return; } + kcov_debug("handle = %llx, context: %s\n", handle, + in_task() ? "task" : "softirq"); + kcov = remote->kcov; /* Put in kcov_remote_stop(). */ - kcov_get(remote->kcov); - t->kcov = remote->kcov; + kcov_get(kcov); /* * Read kcov fields before unlock to prevent races with * KCOV_DISABLE / kcov_remote_reset(). */ - size = remote->kcov->remote_size; - mode = remote->kcov->mode; - sequence = remote->kcov->sequence; - area = kcov_remote_area_get(size); - spin_unlock(&kcov_remote_lock); + mode = kcov->mode; + sequence = kcov->sequence; + if (in_task()) { + size = kcov->remote_size; + area = kcov_remote_area_get(size); + } else { + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } + spin_unlock_irqrestore(&kcov_remote_lock, flags); + /* Can only happen when in_task(). */ if (!area) { area = vmalloc(size * sizeof(unsigned long)); if (!area) { - t->kcov = NULL; - kcov_put(remote->kcov); + kcov_put(kcov); return; } } + + local_irq_save(flags); + /* Reset coverage size. */ *(u64 *)area = 0; - kcov_debug("area = %px, size = %u", area, size); + if (in_serving_softirq()) { + kcov_remote_softirq_start(t); + t->kcov_softirq = 1; + } + kcov_start(t, kcov, size, area, mode, sequence); - kcov_start(t, size, area, mode, sequence); + local_irq_restore(flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -876,34 +954,58 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, void kcov_remote_stop(void) { struct task_struct *t = current; - struct kcov *kcov = t->kcov; - void *area = t->kcov_area; - unsigned int size = t->kcov_size; - int sequence = t->kcov_sequence; + struct kcov *kcov; + unsigned int mode; + void *area; + unsigned int size; + int sequence; + unsigned long flags; - if (!kcov) { - kcov_debug("no kcov found\n"); + if (!in_task() && !in_serving_softirq()) + return; + + local_irq_save(flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { + local_irq_restore(flags); + return; + } + kcov = t->kcov; + area = t->kcov_area; + size = t->kcov_size; + sequence = t->kcov_sequence; + + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { + local_irq_restore(flags); return; } kcov_stop(t); - t->kcov = NULL; + if (in_serving_softirq()) { + t->kcov_softirq = 0; + kcov_remote_softirq_stop(t); + } spin_lock(&kcov->lock); /* * KCOV_DISABLE could have been called between kcov_remote_start() - * and kcov_remote_stop(), hence the check. + * and kcov_remote_stop(), hence the sequence check.
*/ - kcov_debug("move if: %d == %d && %d\n", - sequence, kcov->sequence, (int)kcov->remote); if (sequence == kcov->sequence && kcov->remote) kcov_move_area(kcov->mode, kcov->area, kcov->size, area); spin_unlock(&kcov->lock); - spin_lock(&kcov_remote_lock); - kcov_remote_area_put(area, size); - spin_unlock(&kcov_remote_lock); + if (in_task()) { + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + } + + local_irq_restore(flags); + /* Get in kcov_remote_start(). */ kcov_put(kcov); } EXPORT_SYMBOL(kcov_remote_stop); @@ -917,6 +1019,16 @@ EXPORT_SYMBOL(kcov_common_handle); static int __init kcov_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; + } + /* * The kcov debugfs file won't ever get removed and thus, * there is no need to protect it against removal races. The diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index faa74d5f6941..bb05fd52de85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,6 +540,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ + + /* Don't use memory that will be detected and handled by a driver. */ + if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + return 0; + if (sz < kbuf->memsz) return 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0fbdee78266b..50cd84f53df0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2475,24 +2475,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static const struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; -static int kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) @@ -2529,24 +2519,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) mutex_unlock(&kprobe_mutex); } -static const struct seq_operations kprobe_blacklist_seq_ops = { +static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; - -static int kprobe_blacklist_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_blacklist_seq_ops); -} - -static const struct file_operations debugfs_kprobe_blacklist_ops = { - .open = kprobe_blacklist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { @@ -2705,13 +2684,12 @@ static int __init debugfs_kprobe_init(void) dir = debugfs_create_dir("kprobes", NULL); - debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); + debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, - 
&debugfs_kprobe_blacklist_ops); + &kprobe_blacklist_fops); return 0; } diff --git a/kernel/relay.c b/kernel/relay.c index 90c7a002436d..204867220f8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename, return NULL; chan->buf = alloc_percpu(struct rchan_buf *); + if (!chan->buf) { + kfree(chan); + return NULL; + } + chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -991,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf, /* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; size_t consumed = buf->subbufs_consumed; - relay_file_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, 0, 0); consumed = buf->subbufs_consumed; @@ -1059,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position * @buf: relay channel buffer * - * If the @read_pos is in the middle of padding, return the + * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; + size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; padding_start = (read_subbuf + 1) * subbuf_size - padding; @@ -1131,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp, do { void *from; - if (!relay_file_read_avail(buf, *ppos)) + if (!relay_file_read_avail(buf)) break; - read_start = relay_file_read_start_pos(*ppos, buf); + read_start = relay_file_read_start_pos(buf); avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) break; diff --git a/kernel/user.c b/kernel/user.c index 5235d7f49982..b1635d94a1f2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns); #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; -struct hlist_head uidhash_table[UIDHASH_SZ]; +static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0217ed126f77..b9a450b1fbf7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -658,6 +658,12 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config ARCH_HAS_DEBUG_VM_PGTABLE + bool + help + An architecture should select this when it can successfully + build and run DEBUG_VM_PGTABLE. + config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL @@ -693,6 +699,22 @@ config DEBUG_VM_PGFLAGS If unsure, say N. 
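Stepping back to the DEFINE_SEQ_ATTRIBUTE() helper added to linux/seq_file.h earlier in this section (and applied to the kprobes debugfs files above): the macro expects a seq_operations table named <name>_sops and generates <name>_open() plus <name>_fops. A minimal hypothetical user could look like the sketch below; the demo_* names and the fixed array are invented for illustration.

#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static const int demo_vals[] = { 1, 2, 3 };

/* Standard seq_file iterator over a small fixed array. */
static void *demo_seq_start(struct seq_file *m, loff_t *pos)
{
	return *pos < ARRAY_SIZE(demo_vals) ? (void *)&demo_vals[*pos] : NULL;
}

static void *demo_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return *pos < ARRAY_SIZE(demo_vals) ? (void *)&demo_vals[*pos] : NULL;
}

static void demo_seq_stop(struct seq_file *m, void *v)
{
}

static int demo_seq_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%d\n", *(const int *)v);
	return 0;
}

/* Must be named <name>_sops for DEFINE_SEQ_ATTRIBUTE(<name>). */
static const struct seq_operations demo_sops = {
	.start = demo_seq_start,
	.next = demo_seq_next,
	.stop = demo_seq_stop,
	.show = demo_seq_show,
};

DEFINE_SEQ_ATTRIBUTE(demo);	/* emits demo_open() and demo_fops */

static int __init demo_init(void)
{
	debugfs_create_file("demo", 0400, NULL, NULL, &demo_fops);
	return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");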
+config DEBUG_VM_PGTABLE + bool "Debug arch page table for semantics compliance" + depends on MMU + depends on ARCH_HAS_DEBUG_VM_PGTABLE + default y if DEBUG_VM + help + This option provides a debug method which can be used to test + architecture page table helper functions on various platforms to + verify that they comply with expected generic MM semantics. This + will help architecture code in making sure that any changes or + new additions of these helpers still conform to expected + semantics of the generic MM. Platforms will have to opt in for + this through ARCH_HAS_DEBUG_VM_PGTABLE. + + If unsure, say N. + config ARCH_HAS_DEBUG_VIRTUAL bool @@ -1774,6 +1796,15 @@ config KCOV_INSTRUMENT_ALL filesystem fuzzing with AFL) then you will want to enable coverage for more specific subsets of files, and should say n here. +config KCOV_IRQ_AREA_SIZE + hex "Size of interrupt coverage collection area in words" + depends on KCOV + default 0x40000 + help + KCOV uses preallocated per-cpu areas to collect coverage from + soft interrupts. This specifies the size of those areas, in + unsigned long words. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" def_bool y @@ -1991,6 +2022,19 @@ config TEST_LKM If unsure, say N. +config TEST_BITOPS + tristate "Test module for compilation of clear_bit/set_bit operations" + depends on m + help + This builds the "test_bitops" module that is much like the + TEST_LKM module except that it does a basic exercise of the + clear_bit and set_bit macros to make sure there are no compiler + warnings from C=1 sparse checker or -Wextra compilations. It has + no dependencies and doesn't run or load unless explicitly requested + by name. For example: modprobe test_bitops. + + If unsure, say N. + config TEST_VMALLOC tristate "Test module for stress/performance analysis of vmalloc allocator" default n diff --git a/lib/Makefile b/lib/Makefile index 5adf8949a757..32f19b4d1d2a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -56,6 +56,8 @@ obj-y += kstrtox.o obj-$(CONFIG_FIND_BIT_BENCHMARK) += find_bit_benchmark.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_BITOPS) += test_bitops.o +CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_IDA) += test_ida.o diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 7852bfff50b1..451543937524 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -266,8 +266,7 @@ void __fprop_inc_percpu_max(struct fprop_global *p, if (numerator > (((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT) return; - } else - fprop_reflect_period_percpu(p, pl); - percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&p->events, 1); + } + + __fprop_inc_percpu(p, pl); } diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index 052f5b727be7..d42cebf7407f 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "prime numbers: " fmt "\n" +#define pr_fmt(fmt) "prime numbers: " fmt #include <linux/module.h> #include <linux/mutex.h> @@ -253,7 +253,7 @@ static void dump_primes(void) if (buf) bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); - pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s", + pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n", p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); rcu_read_unlock(); @@ 
-273,7 +273,7 @@ static int selftest(unsigned long max) bool fast = is_prime_number(x); if (slow != fast) { - pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!", + pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n", x, slow ? "yes" : "no", fast ? "yes" : "no"); goto err; } @@ -282,14 +282,14 @@ static int selftest(unsigned long max) continue; if (next_prime_number(last) != x) { - pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu", + pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n", last, x, next_prime_number(last)); goto err; } last = x; } - pr_info("selftest(%lu) passed, last prime was %lu", x, last); + pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last); return 0; err: diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8d092609928e..0ba686b8fe57 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "%s: " fmt "\n", __func__ +#define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> @@ -141,8 +141,8 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %ld percpu %ld", - atomic_long_read(&ref->count), (long)count); + pr_debug("global %lu percpu %lu\n", + atomic_long_read(&ref->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index b90ec550183a..34696a348864 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -98,6 +98,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; + might_fault(); if (unlikely(count <= 0)) return 0; diff --git a/lib/test_bitops.c b/lib/test_bitops.c new file mode 100644 index 000000000000..fd50b3ae4a14 --- /dev/null +++ b/lib/test_bitops.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Intel Corporation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/printk.h> + +/* a tiny module only meant to test set/clear_bit */ + +/* use an enum because that's the most common BITMAP usage */ +enum bitops_fun { + BITOPS_4 = 4, + BITOPS_7 = 7, + BITOPS_11 = 11, + BITOPS_31 = 31, + BITOPS_88 = 88, + BITOPS_LAST = 255, + BITOPS_LENGTH = 256 +}; + +static DECLARE_BITMAP(g_bitmap, BITOPS_LENGTH); + +static int __init test_bitops_startup(void) +{ + pr_warn("Loaded test module\n"); + set_bit(BITOPS_4, g_bitmap); + set_bit(BITOPS_7, g_bitmap); + set_bit(BITOPS_11, g_bitmap); + set_bit(BITOPS_31, g_bitmap); + set_bit(BITOPS_88, g_bitmap); + return 0; +} + +static void __exit test_bitops_unstartup(void) +{ + int bit_set; + + clear_bit(BITOPS_4, g_bitmap); + clear_bit(BITOPS_7, g_bitmap); + clear_bit(BITOPS_11, g_bitmap); + clear_bit(BITOPS_31, g_bitmap); + clear_bit(BITOPS_88, g_bitmap); + + bit_set = find_first_bit(g_bitmap, BITOPS_LAST); + if (bit_set != BITOPS_LAST) + pr_err("ERROR: FOUND SET BIT %d\n", bit_set); + + pr_warn("Unloaded test module\n"); +} + +module_init(test_bitops_startup); +module_exit(test_bitops_unstartup); + +MODULE_AUTHOR("Jesse Brandeburg <jesse.brandeburg@intel.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Bit testing module"); diff --git a/lib/test_lockup.c b/lib/test_lockup.c index ea09ca335b21..419fbaceba73 100644 --- a/lib/test_lockup.c +++ 
b/lib/test_lockup.c @@ -142,7 +142,7 @@ module_param(reallocate_pages, bool, 0400); MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations"); struct file *test_file; -struct inode *test_inode; +static struct inode *test_inode; static char test_file_path[256]; module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400); MODULE_PARM_DESC(file_path, "file path to test"); diff --git a/lib/ubsan.c b/lib/ubsan.c index f8c0ccf35f29..cb9af3f6b77e 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -189,7 +189,7 @@ static void handle_overflow(struct overflow_data *data, void *lhs, ubsan_epilogue(); } -void __ubsan_handle_add_overflow(struct overflow_data *data, +void __ubsan_handle_add_overflow(void *data, void *lhs, void *rhs) { @@ -197,23 +197,23 @@ void __ubsan_handle_add_overflow(struct overflow_data *data, } EXPORT_SYMBOL(__ubsan_handle_add_overflow); -void __ubsan_handle_sub_overflow(struct overflow_data *data, +void __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '-'); } EXPORT_SYMBOL(__ubsan_handle_sub_overflow); -void __ubsan_handle_mul_overflow(struct overflow_data *data, +void __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs) { handle_overflow(data, lhs, rhs, '*'); } EXPORT_SYMBOL(__ubsan_handle_mul_overflow); -void __ubsan_handle_negate_overflow(struct overflow_data *data, - void *old_val) +void __ubsan_handle_negate_overflow(void *_data, void *old_val) { + struct overflow_data *data = _data; char old_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -231,9 +231,9 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data, EXPORT_SYMBOL(__ubsan_handle_negate_overflow); -void __ubsan_handle_divrem_overflow(struct overflow_data *data, - void *lhs, void *rhs) +void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { + struct overflow_data *data = _data; char rhs_val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -326,10 +326,9 @@ void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, } EXPORT_SYMBOL(__ubsan_handle_type_mismatch); -void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, - void *ptr) +void __ubsan_handle_type_mismatch_v1(void *_data, void *ptr) { - + struct type_mismatch_data_v1 *data = _data; struct type_mismatch_data_common common_data = { .location = &data->location, .type = data->type, @@ -341,8 +340,9 @@ void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, } EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1); -void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) +void __ubsan_handle_out_of_bounds(void *_data, void *index) { + struct out_of_bounds_data *data = _data; char index_str[VALUE_LENGTH]; if (suppress_report(&data->location)) @@ -357,9 +357,9 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index) } EXPORT_SYMBOL(__ubsan_handle_out_of_bounds); -void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, - void *lhs, void *rhs) +void __ubsan_handle_shift_out_of_bounds(void *_data, void *lhs, void *rhs) { + struct shift_out_of_bounds_data *data = _data; struct type_descriptor *rhs_type = data->rhs_type; struct type_descriptor *lhs_type = data->lhs_type; char rhs_str[VALUE_LENGTH]; @@ -399,8 +399,9 @@ out: EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); -void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) +void __ubsan_handle_builtin_unreachable(void *_data) { + struct unreachable_data 
*data = _data; ubsan_prologue(&data->location, "unreachable"); pr_err("calling __builtin_unreachable()\n"); ubsan_epilogue(); @@ -408,9 +409,9 @@ void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) } EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable); -void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, - void *val) +void __ubsan_handle_load_invalid_value(void *_data, void *val) { + struct invalid_value_data *data = _data; char val_str[VALUE_LENGTH]; if (suppress_report(&data->location)) diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 2c13ecc5bb2c..ed1f3df27260 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -10,17 +10,6 @@ #ifndef ASMINF -/* Allow machine dependent optimization for post-increment or pre-increment. - Based on testing to date, - Pre-increment preferred for: - - PowerPC G3 (Adler) - - MIPS R5000 (Randers-Pehrson) - Post-increment preferred for: - - none - No measurable difference: - - Pentium III (Anderson) - - M68060 (Nikl) - */ union uu { unsigned short us; unsigned char b[2]; @@ -38,16 +27,6 @@ get_unaligned16(const unsigned short *p) return mm.us; } -#ifdef POSTINC -# define OFF 0 -# define PUP(a) *(a)++ -# define UP_UNALIGNED(a) get_unaligned16((a)++) -#else -# define OFF 1 -# define PUP(a) *++(a) -# define UP_UNALIGNED(a) get_unaligned16(++(a)) -#endif - /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -115,9 +94,9 @@ void inflate_fast(z_streamp strm, unsigned start) /* copy state to local variables */ state = (struct inflate_state *)strm->state; - in = strm->next_in - OFF; + in = strm->next_in; last = in + (strm->avail_in - 5); - out = strm->next_out - OFF; + out = strm->next_out; beg = out - (start - strm->avail_out); end = out + (strm->avail_out - 257); #ifdef INFLATE_STRICT @@ -138,9 +117,9 @@ void inflate_fast(z_streamp strm, unsigned start) input data or output space */ do { if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = lcode[hold & lmask]; @@ -150,14 +129,14 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; op = (unsigned)(this.op); if (op == 0) { /* literal */ - PUP(out) = (unsigned char)(this.val); + *out++ = (unsigned char)(this.val); } else if (op & 16) { /* length base */ len = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (op) { if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } len += (unsigned)hold & ((1U << op) - 1); @@ -165,9 +144,9 @@ void inflate_fast(z_streamp strm, unsigned start) bits -= op; } if (bits < 15) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } this = dcode[hold & dmask]; @@ -180,10 +159,10 @@ void inflate_fast(z_streamp strm, unsigned start) dist = (unsigned)(this.val); op &= 15; /* number of extra bits */ if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; if (bits < op) { - hold += (unsigned long)(PUP(in)) << bits; + hold += (unsigned long)(*in++) << bits; bits += 8; } } @@ -205,13 +184,13 @@ void inflate_fast(z_streamp strm, unsigned start) state->mode = BAD; break; } - from 
= window - OFF; + from = window; if (write == 0) { /* very common case */ from += wsize - op; if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } @@ -222,14 +201,14 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from end of window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); - from = window - OFF; + from = window; if (write < len) { /* some from start of window */ op = write; len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } @@ -240,21 +219,21 @@ void inflate_fast(z_streamp strm, unsigned start) if (op < len) { /* some from window */ len -= op; do { - PUP(out) = PUP(from); + *out++ = *from++; } while (--op); from = out - dist; /* rest from output */ } } while (len > 2) { - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); + *out++ = *from++; + *out++ = *from++; + *out++ = *from++; len -= 3; } if (len) { - PUP(out) = PUP(from); + *out++ = *from++; if (len > 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else { @@ -264,29 +243,29 @@ void inflate_fast(z_streamp strm, unsigned start) from = out - dist; /* copy direct from output */ /* minimum length is three */ /* Align out addr */ - if (!((long)(out - 1 + OFF) & 1)) { - PUP(out) = PUP(from); + if (!((long)(out - 1) & 1)) { + *out++ = *from++; len--; } - sout = (unsigned short *)(out - OFF); + sout = (unsigned short *)(out); if (dist > 2) { unsigned short *sfrom; - sfrom = (unsigned short *)(from - OFF); + sfrom = (unsigned short *)(from); loops = len >> 1; do #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - PUP(sout) = PUP(sfrom); + *sout++ = *sfrom++; #else - PUP(sout) = UP_UNALIGNED(sfrom); + *sout++ = get_unaligned16(sfrom++); #endif while (--loops); - out = (unsigned char *)sout + OFF; - from = (unsigned char *)sfrom + OFF; + out = (unsigned char *)sout; + from = (unsigned char *)sfrom; } else { /* dist == 1 or dist == 2 */ unsigned short pat16; - pat16 = *(sout-1+OFF); + pat16 = *(sout-1); if (dist == 1) { union uu mm; /* copy one char pattern to both bytes */ @@ -296,12 +275,12 @@ void inflate_fast(z_streamp strm, unsigned start) } loops = len >> 1; do - PUP(sout) = pat16; + *sout++ = pat16; while (--loops); - out = (unsigned char *)sout + OFF; + out = (unsigned char *)sout; } if (len & 1) - PUP(out) = PUP(from); + *out++ = *from++; } } else if ((op & 64) == 0) { /* 2nd level distance code */ @@ -336,8 +315,8 @@ void inflate_fast(z_streamp strm, unsigned start) hold &= (1U << bits) - 1; /* update state and return */ - strm->next_in = in + OFF; - strm->next_out = out + OFF; + strm->next_in = in; + strm->next_out = out; strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end)); diff --git a/mm/Kconfig b/mm/Kconfig index e3490ecac839..cffc276fa19c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -133,6 +133,9 @@ config HAVE_FAST_GUP depends on MMU bool +# Don't discard allocated memory used to track "memory" and "reserved" memblocks +# after early boot, so it can still be used to test for validity of memory. +# Also, memblocks are updated with memory hot(un)plug. 
config ARCH_KEEP_MEMBLOCK bool @@ -155,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on 64BIT || BROKEN select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE diff --git a/mm/Makefile b/mm/Makefile index fccd3756b25f..662fd1504646 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o +obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/compaction.c b/mm/compaction.c index 14d2fe231ea4..fd988b7e5f2b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1401,7 +1401,7 @@ fast_isolate_freepages(struct compact_control *cc) if (scan_start) { /* * Use the highest PFN found above min. If one was - * not found, be pessemistic for direct compaction + * not found, be pessimistic for direct compaction * and use the min mark. */ if (highest) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c new file mode 100644 index 000000000000..188c18908964 --- /dev/null +++ b/mm/debug_vm_pgtable.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This kernel test validates architecture page table helpers and + * accessors and helps in verifying their continued compliance with + * expected generic MM semantics. + * + * Copyright (C) 2019 ARM Ltd. + * + * Author: Anshuman Khandual <anshuman.khandual@arm.com> + */ +#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__ + +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/kconfig.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/module.h> +#include <linux/pfn_t.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/spinlock.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/start_kernel.h> +#include <linux/sched/mm.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) + +/* + * On s390 platform, the lower 4 bits are used to identify given page table + * entry type. But these bits might affect the ability to clear entries with + * pxx_clear() because of how dynamic page table folding works on s390. So + * while loading up the entries, do not change the lower 4 bits. It does not + * affect any other platform.
+ */ +#define S390_MASK_BITS 4 +#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define RANDOM_NZVALUE GENMASK(7, 0) + +static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pte_t pte = pfn_pte(pfn, prot); + + WARN_ON(!pte_same(pte, pte)); + WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); + WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); + WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pmd_t pmd = pfn_pmd(pfn, prot); + + WARN_ON(!pmd_same(pmd, pmd)); + WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); + WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); + WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pmd_bad(). + */ + WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pud_t pud = pfn_pud(pfn, prot); + + WARN_ON(!pud_same(pud, pud)); + WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); + WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); + WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + + if (mm_pmd_folded(&init_mm)) + return; + + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pud_bad(). + */ + WARN_ON(!pud_bad(pud_mkhuge(pud))); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +{ + p4d_t p4d; + + memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t)); + WARN_ON(!p4d_same(p4d, p4d)); +} + +static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pgd_t pgd; + + memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t)); + WARN_ON(!pgd_same(pgd, pgd)); +} + +#ifndef __PAGETABLE_PUD_FOLDED +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +{ + pud_t pud = READ_ONCE(*pudp); + + if (mm_pmd_folded(mm)) + return; + + pud = __pud(pud_val(pud) | RANDOM_ORVALUE); + WRITE_ONCE(*pudp, pud); + pud_clear(pudp); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +} + +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ + pud_t pud; + + if (mm_pmd_folded(mm)) + return; + /* + * This entry points to next level page table page. + * Hence this must not qualify as pud_bad().
+ */ + pmd_clear(pmdp); + pud_clear(pudp); + pud_populate(mm, pudp, pmdp); + pud = READ_ONCE(*pudp); + WARN_ON(pud_bad(pud)); +} +#else /* !__PAGETABLE_PUD_FOLDED */ +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ +} +#endif /* PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_P4D_FOLDED +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +{ + p4d_t p4d = READ_ONCE(*p4dp); + + if (mm_pud_folded(mm)) + return; + + p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); + WRITE_ONCE(*p4dp, p4d); + p4d_clear(p4dp); + p4d = READ_ONCE(*p4dp); + WARN_ON(!p4d_none(p4d)); +} + +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ + p4d_t p4d; + + if (mm_pud_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as p4d_bad(). + */ + pud_clear(pudp); + p4d_clear(p4dp); + p4d_populate(mm, p4dp, pudp); + p4d = READ_ONCE(*p4dp); + WARN_ON(p4d_bad(p4d)); +} + +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = READ_ONCE(*pgdp); + + if (mm_p4d_folded(mm)) + return; + + pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); + WRITE_ONCE(*pgdp, pgd); + pgd_clear(pgdp); + pgd = READ_ONCE(*pgdp); + WARN_ON(!pgd_none(pgd)); +} + +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ + pgd_t pgd; + + if (mm_p4d_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pgd_bad(). + */ + p4d_clear(p4dp); + pgd_clear(pgdp); + pgd_populate(mm, pgdp, p4dp); + pgd = READ_ONCE(*pgdp); + WARN_ON(pgd_bad(pgd)); +} +#else /* !__PAGETABLE_P4D_FOLDED */ +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ +} +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ +} +#endif /* PAGETABLE_P4D_FOLDED */ + +static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, + unsigned long vaddr) +{ + pte_t pte = READ_ONCE(*ptep); + + pte = __pte(pte_val(pte) | RANDOM_ORVALUE); + set_pte_at(mm, vaddr, ptep, pte); + barrier(); + pte_clear(mm, vaddr, ptep); + pte = READ_ONCE(*ptep); + WARN_ON(!pte_none(pte)); +} + +static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +{ + pmd_t pmd = READ_ONCE(*pmdp); + + pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); + WRITE_ONCE(*pmdp, pmd); + pmd_clear(pmdp); + pmd = READ_ONCE(*pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pmd_t pmd; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pmd_bad(). 
+ */ + pmd_clear(pmdp); + pmd_populate(mm, pmdp, pgtable); + pmd = READ_ONCE(*pmdp); + WARN_ON(pmd_bad(pmd)); +} + +static unsigned long __init get_random_vaddr(void) +{ + unsigned long random_vaddr, random_pages, total_user_pages; + + total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE; + + random_pages = get_random_long() % total_user_pages; + random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE; + + return random_vaddr; +} + +static int __init debug_vm_pgtable(void) +{ + struct mm_struct *mm; + pgd_t *pgdp; + p4d_t *p4dp, *saved_p4dp; + pud_t *pudp, *saved_pudp; + pmd_t *pmdp, *saved_pmdp, pmd; + pte_t *ptep; + pgtable_t saved_ptep; + pgprot_t prot; + phys_addr_t paddr; + unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long pud_aligned, p4d_aligned, pgd_aligned; + spinlock_t *uninitialized_var(ptl); + + pr_info("Validating architecture page table helpers\n"); + prot = vm_get_page_prot(VMFLAGS); + vaddr = get_random_vaddr(); + mm = mm_alloc(); + if (!mm) { + pr_err("mm_struct allocation failed\n"); + return 1; + } + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels. + */ + paddr = __pa_symbol(&start_kernel); + + pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; + pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; + pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; + p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; + pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; + WARN_ON(!pfn_valid(pte_aligned)); + + pgdp = pgd_offset(mm, vaddr); + p4dp = p4d_alloc(mm, pgdp, vaddr); + pudp = pud_alloc(mm, p4dp, vaddr); + pmdp = pmd_alloc(mm, pudp, vaddr); + ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + + /* + * Save all the page table page addresses as the page table + * entries will be used for testing with random or garbage + * values. These saved addresses will be used for freeing + * page table pages. + */ + pmd = READ_ONCE(*pmdp); + saved_p4dp = p4d_offset(pgdp, 0UL); + saved_pudp = pud_offset(p4dp, 0UL); + saved_pmdp = pmd_offset(pudp, 0UL); + saved_ptep = pmd_pgtable(pmd); + + pte_basic_tests(pte_aligned, prot); + pmd_basic_tests(pmd_aligned, prot); + pud_basic_tests(pud_aligned, prot); + p4d_basic_tests(p4d_aligned, prot); + pgd_basic_tests(pgd_aligned, prot); + + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + + p4d_free(mm, saved_p4dp); + pud_free(mm, saved_pudp); + pmd_free(mm, saved_pmdp); + pte_free(mm, saved_ptep); + + mm_dec_nr_puds(mm); + mm_dec_nr_pmds(mm); + mm_dec_nr_ptes(mm); + mmdrop(mm); + return 0; +} +late_initcall(debug_vm_pgtable); diff --git a/mm/filemap.c b/mm/filemap.c index 455990621989..b1a41890d80e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1256,7 +1256,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * instead. 
* * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unneccssary since it is + * being cleared, but a memory barrier should be unnecessary since it is * in the same byte as PG_locked. */ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) diff --git a/mm/frontswap.c b/mm/frontswap.c index 60bb20e8a951..bfa3a339253e 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { } * * This would not guard us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). - * Fortunatly for us, the swapon_mutex has been taked by the callee so we are + * Fortunately for us, the swapon_mutex has been taken by the callee so we are * OK. The other scenario where calls to frontswap_store (called via * swap_writepage) is racing with frontswap_invalidate_area (called via * swapoff) is again guarded by the swap subsystem. @@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } /* - * Used to check if it's necessory and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shink pages, + * Used to check if it's necessary and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shrink pages, * error code when there is an error. */ static int __frontswap_shrink(unsigned long target_pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e8669885232f..4368e964d2aa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page) bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) - return 0; + return false; page = compound_head(page); return is_huge_zero_page(page) || diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac0d7bbc0692..dcb34d7f5562 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -85,7 +85,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, give up any reservations mased on minimum size and + * remain, give up any reservations based on minimum size and * free the subpool */ if (free) { if (spool->min_hpages != -1) @@ -133,7 +133,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * the request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where - * a subpool minimum size must be manitained. + * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) @@ -473,7 +473,7 @@ out_of_memory: * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for - * this operation and we were not able to allocate, it ruturns -ENOMEM. + * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. @@ -988,7 +988,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * We know VM_NORESERVE is not set. Therefore, there SHOULD * be a region map for all pages. 
The only situation where * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reverves to + * fallocate. In this case, there really are no reserves to * use. This situation is indicated if chg != 0. */ if (chg) @@ -1519,7 +1519,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwse drivers using get_user_pages() to access tail + * too. Otherwise drivers using get_user_pages() to access tail * pages may get the reference counting wrong if they see * PG_reserved set on a tail page (despite the head page not * having PG_reserved set). Enforcing this consistency between @@ -4579,9 +4579,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * entry could be a migration/hwpoison entry at this point, so this * check prevents the kernel from going below assuming that we have - * a active hugepage in pagecache. This goto expects the 2nd page fault, - * and is_hugetlb_entry_(migration|hwpoisoned) check will properly - * handle it. + * an active hugepage in pagecache. This goto expects the 2nd page + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will + * properly handle it. */ if (!pte_present(entry)) goto out_mutex; diff --git a/mm/internal.h b/mm/internal.h index 9117bca90f4b..791e4b5a807c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -132,7 +132,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcd..fe6be0be1f76 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_early_shadow_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. */ -#ifndef __ARCH_HAS_5LEVEL_HACK pgd_populate(&init_mm, pgd, lm_alias(kasan_early_shadow_p4d)); -#endif p4d = p4d_offset(pgd, addr); p4d_populate(&init_mm, p4d, lm_alias(kasan_early_shadow_pud)); @@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages - * that are wrprotected and have the exact same + * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); @@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, /* * No need to check ksm_use_zero_pages here: we can only have a - * zero_page here if ksm_use_zero_pages was enabled alreaady. + * zero_page here if ksm_use_zero_pages was enabled already. 
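A worked example of the hugetlb subpool accounting clarified earlier in this series: suppose a subpool is created with min_hpages = 10 and 8 of those reserved pages are already in use, leaving rsv_hpages = 2. A request of delta = 5 is satisfied from the remaining reserve first, so hugepage_subpool_get_pages() returns 5 - min(5, 2) = 3, the number of pages the global pools must actually provide; with no minimum size it would simply return 5. (Numbers are illustrative.)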
*/ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); @@ -1608,7 +1608,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -1843,7 +1843,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check - * for other negative values as an undeflow if detected here + * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ diff --git a/mm/list_lru.c b/mm/list_lru.c index 4d5294c39bba..9222910ab1cb 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -213,7 +213,7 @@ restart: /* * decrement nr_to_walk first so that we don't livelock if we - * get stuck on large numbesr of LRU_RETRY items + * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; diff --git a/mm/memblock.c b/mm/memblock.c index 743659d88fc4..39aceafc57f6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -78,7 +78,7 @@ * * memblock_alloc*() - these functions return the **virtual** address * of the allocated memory. * - * Note, that both API variants use implict assumptions about allowed + * Note, that both API variants use implicit assumptions about allowed * memory ranges and the fallback methods. Consult the documentation * of memblock_alloc_internal() and memblock_alloc_range_nid() * functions for more elaborate description. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5381afb23d58..3dde78f5b918 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3186,7 +3186,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsiblity. + * hierarchy. Testing use_hierarchy is the caller's responsibility. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -4838,7 +4838,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of * memory-controlled cgroups to 64k. * - * However, there usually are many references to the oflline CSS after + * However, there usually are many references to the offline CSS after * the cgroup has been destroyed, such as page cache or reclaimable * slab objects, that don't need to hang on to the ID. We want to keep * those dead CSS from occupying IDs, or we might quickly exhaust the diff --git a/mm/memory.c b/mm/memory.c index 7b70398f76a0..d97e8848892d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2467,7 +2467,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, } /* - * The same page can be mapped back since last copy attampt. 
+ * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { @@ -4436,19 +4436,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); -#else - if (!pgd_present(*p4d)) { - mm_inc_nr_puds(mm); - pgd_populate(mm, p4d, new); - } else /* Another has populated it */ - pud_free(mm, new); -#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 926ec704e835..c4d5c45820d0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -98,11 +98,14 @@ void mem_hotplug_done(void) u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - char *resource_name = "System RAM"; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_MEM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -862,10 +865,9 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_new_pgdat(int nid) { struct pglist_data *pgdat; - unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -895,9 +897,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -932,7 +933,6 @@ static void rollback_node_hotadd(int nid) /** * try_online_node - online a node if offlined * @nid: the node ID - * @start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. 
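For reference, __try_online_node()'s tristate return (per the convention documented just below: new node allocated, node already online, or -ENOMEM) is consumed as in this minimal sketch, a hypothetical caller mirroring how add_memory_resource() uses it:

        ret = __try_online_node(nid, false);
        if (ret < 0)
                goto error;     /* pgdat allocation failed */
        new_node = ret;         /* 1: fresh pgdat allocated, 0: already online */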
* @@ -941,7 +941,7 @@ static void rollback_node_hotadd(int nid) * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ -static int __try_online_node(int nid, u64 start, bool set_node_online) +static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; @@ -949,7 +949,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_new_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -973,7 +973,7 @@ int try_online_node(int nid) int ret; mem_hotplug_begin(); - ret = __try_online_node(nid, 0, true); + ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } @@ -1017,17 +1017,17 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret) return ret; + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + mem_hotplug_begin(); - /* - * Add new range to memblock so that when hotadd_new_pgdat() is called - * to allocate new pgdat, get_pfn_range_for_nid() will be able to find - * this new range and calculate total pages correctly. The range will - * be removed at hot-remove time. - */ - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); - ret = __try_online_node(nid, start, false); + ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; @@ -1060,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res) BUG_ON(ret); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); @@ -1074,7 +1075,8 @@ error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); - memblock_remove(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); mem_hotplug_done(); return ret; } @@ -1085,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - res = register_memory_resource(start, size); + res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); @@ -1107,82 +1109,57 @@ int add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(add_memory); -#ifdef CONFIG_MEMORY_HOTREMOVE /* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). Using this, - * the function determines if the pageblock contains only free pages. - * Due to buddy contraints, a free page at least the size of a pageblock will - * be located at the start of the pageblock + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). 
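Concretely, a driver adopting this interface calls it with a resource name following the "System RAM ($DRIVER)" convention enforced below; a hedged sketch (the kmem name mirrors the dax/kmem use case, the other values are illustrative):

        /* accepted: matches "System RAM ($DRIVER)" */
        rc = add_memory_driver_managed(nid, start, size, "System RAM (kmem)");

        /* rejected with -EINVAL: missing the "($DRIVER)" suffix */
        rc = add_memory_driver_managed(nid, start, size, "System RAM");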
+ * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". */ -static inline int pageblock_free(struct page *page) +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name) { - return PageBuddy(page) && page_order(page) >= pageblock_order; -} + struct resource *res; + int rc; -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); + lock_device_hotplug(); - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; } - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } + rc = add_memory_resource(nid, res); + if (rc < 0) + release_memory_resource(res); - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; +out_unlock: + unlock_device_hotplug(); + return rc; } +EXPORT_SYMBOL_GPL(add_memory_driver_managed); +#ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping * memory holes). When true, return the zone. @@ -1360,7 +1337,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, } /* - * Check all pages in range, recoreded as memory resource, are isolated. + * Check all pages in range, recorded as memory resource, are isolated. 
+ * Check all pages in range, recorded as memory resource, are isolated.
*/ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, @@ -1746,8 +1723,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); arch_remove_memory(nid, start, size, NULL); - memblock_free(start, size); - memblock_remove(start, size); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_free(start, size); + memblock_remove(start, size); + } + __release_memory_resource(start, size); try_offline_node(nid); diff --git a/mm/mmap.c b/mm/mmap.c index f609e9ec4a25..39bd60c20a82 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ diff --git a/mm/mremap.c b/mm/mremap.c index 57b1f999f789..7d69e3fe51aa 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -785,7 +785,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, out: if (offset_in_page(ret)) { vm_unacct_memory(charged); - locked = 0; + locked = false; } if (downgraded) up_read(&current->mm->mmap_sem); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d79ed1f88c7a..28b3e7a67565 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * diff --git a/mm/slub.c b/mm/slub.c index d52487919278..b8f798b50d44 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2013,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, #ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. */ diff --git a/mm/sparse.c b/mm/sparse.c index 1aee5a481571..6284328cd9f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -288,7 +288,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) /* * Mark all memblocks as present using memory_present(). This is a - * convienence function that is useful for a number of arches + * convenience function that is useful for a number of arches * to mark all of the systems memory as present during initialization. */ void __init memblocks_present(void) diff --git a/mm/util.c b/mm/util.c index 41b47d8cae09..cd62e6fb5318 100644 --- a/mm/util.c +++ b/mm/util.c @@ -604,6 +604,24 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing.
+ */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; @@ -796,10 +814,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - vm_acct_memory(pages); /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1e94497b7388..3091c2ca60df 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2317,7 +2317,7 @@ static inline void __vfree_deferred(const void *addr) * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to - * nother cpu's list. schedule_work() should be fine with this too. + * another cpu's list. schedule_work() should be fine with this too. */ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3792dd19788c..b6d84326bdf2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -682,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += ret; /* * Bail out if someone want to register a new shrinker to - * prevent the regsitration from being stalled for long periods + * prevent the registration from being stalled for long periods * by parallel ongoing shrinking. */ if (rwsem_is_contended(&shrinker_rwsem)) { @@ -1613,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) /* * Update LRU sizes after isolating pages. The LRU size updates must - * be complete before mem_cgroup_update_lru_size due to a santity check. + * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) @@ -2371,7 +2371,7 @@ out: /* * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decremeting + * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. 
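A usage sketch for the new kvfree_sensitive() (hypothetical caller holding key material):

        void *secret = kvmalloc(len, GFP_KERNEL);

        if (!secret)
                return -ENOMEM;
        /* ... stage the sensitive payload in secret ... */
        kvfree_sensitive(secret, len);  /* memzero_explicit(), then kvfree() */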
*/ scan = max(scan, SWAP_CLUSTER_MAX); diff --git a/mm/vmstat.c b/mm/vmstat.c index a7db29f7e5f7..3fb23a21f6dd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2069,24 +2069,14 @@ static int unusable_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2121,24 +2111,14 @@ static int extfrag_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { @@ -2147,10 +2127,10 @@ static int __init extfrag_debug_init(void) extfrag_debug_root = debugfs_create_dir("extfrag", NULL); debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, - &unusable_file_ops); + &unusable_fops); debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, - &extfrag_file_ops); + &extfrag_fops); return 0; } diff --git a/mm/zbud.c b/mm/zbud.c index de5dd4ddaa82..bc93aa4e46fc 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page) zhdr->last_chunks = 0; INIT_LIST_HEAD(&zhdr->buddy); INIT_LIST_HEAD(&zhdr->lru); - zhdr->under_reclaim = 0; + zhdr->under_reclaim = false; return zhdr; } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bf9e0e87a6ef..10a0e035a787 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -246,6 +246,8 @@ list_types(0) if ($list_types); $fix = 1 if ($fix_inplace); $check_orig = $check; +die "$P: --git cannot be used with --file or --fix\n" if ($git && ($file || $fix)); + my $exit = 0; my $perl_version_ok = 1; @@ -269,11 +271,11 @@ if ($color =~ /^[01]$/) { } elsif ($color =~ /^auto$/i) { $color = (-t STDOUT); } else { - die "Invalid color mode: $color\n"; + die "$P: Invalid color mode: $color\n"; } # skip TAB size 1 to avoid additional checks on $tabsize - 1 -die "Invalid TAB size: $tabsize\n" if ($tabsize < 2); +die "$P: Invalid TAB size: $tabsize\n" if ($tabsize < 2); sub hash_save_array_words { my ($hashRef, $arrayRef) = @_; @@ -1060,6 +1062,7 @@ for my $filename (@ARGV) { while (<$FILE>) { chomp; push(@rawlines, $_); + $vname = qq("$1") if ($filename eq '-' && $_ =~ m/^Subject:\s+(.+)/i); } close($FILE); @@ -1676,8 +1679,16 @@ sub ctx_statement_level { sub ctx_locate_comment { my ($first_line, $end_line) = @_; + # If c99 comment on the current line, or the line before or after + my ($current_comment) = ($rawlines[$end_line - 1] =~ m@^\+.*(//.*$)@); + return $current_comment if (defined $current_comment); + ($current_comment) = ($rawlines[$end_line - 2] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if 
(defined $current_comment); + ($current_comment) = ($rawlines[$end_line] =~ m@^[\+ ].*(//.*$)@); + return $current_comment if (defined $current_comment); + # Catch a comment on the end of the line itself. - my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); + ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); return $current_comment if (defined $current_comment); # Look through the context and try and figure out if there is a @@ -3062,14 +3073,43 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# check for MAINTAINERS entries that don't have the right form - if ($realfile =~ /^MAINTAINERS$/ && - $rawline =~ /^\+[A-Z]:/ && - $rawline !~ /^\+[A-Z]:\t\S/) { - if (WARN("MAINTAINERS_STYLE", - "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; +# check MAINTAINERS entries + if ($realfile =~ /^MAINTAINERS$/) { +# check MAINTAINERS entries for the right form + if ($rawline =~ /^\+[A-Z]:/ && + $rawline !~ /^\+[A-Z]:\t\S/) { + if (WARN("MAINTAINERS_STYLE", + "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; + } + } +# check MAINTAINERS entries for the right ordering too + my $preferred_order = 'MRLSWQBCPTFXNK'; + if ($rawline =~ /^\+[A-Z]:/ && + $prevrawline =~ /^[\+ ][A-Z]:/) { + $rawline =~ /^\+([A-Z]):\s*(.*)/; + my $cur = $1; + my $curval = $2; + $prevrawline =~ /^[\+ ]([A-Z]):\s*(.*)/; + my $prev = $1; + my $prevval = $2; + my $curindex = index($preferred_order, $cur); + my $previndex = index($preferred_order, $prev); + if ($curindex < 0) { + WARN("MAINTAINERS_STYLE", + "Unknown MAINTAINERS entry type: '$cur'\n" . $herecurr); + } else { + if ($previndex >= 0 && $curindex < $previndex) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list '$cur:' before '$prev:'\n" . $hereprev); + } elsif ((($prev eq 'F' && $cur eq 'F') || + ($prev eq 'X' && $cur eq 'X')) && + ($prevval cmp $curval) > 0) { + WARN("MAINTAINERS_STYLE", + "Misordered MAINTAINERS entry - list file patterns in alphabetic order\n" . $hereprev); + } + } } } diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6cbcd1a3e113..484d2fbf5921 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -19,6 +19,7 @@ my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); use Cwd; use File::Find; +use File::Spec::Functions; my $cur_path = fastgetcwd() . '/'; my $lk_path = "./"; @@ -57,7 +58,7 @@ my $status = 0; my $letters = ""; my $keywords = 1; my $sections = 0; -my $file_emails = 0; +my $email_file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; my $self_test = undef; @@ -69,6 +70,12 @@ my $vcs_used = 0; my $exit = 0; +my @files = (); +my @fixes = (); # If a patch description includes Fixes: lines +my @range = (); +my @keyword_tvi = (); +my @file_emails = (); + my %commit_author_hash; my %commit_signer_hash; @@ -266,7 +273,7 @@ if (!GetOptions( 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, 'sections!' => \$sections, - 'fe|file-emails!' => \$file_emails, + 'fe|file-emails!' 
=> \$email_file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, 'mpath|maintainer-path=s' => \$maintainer_path, @@ -424,6 +431,22 @@ sub read_all_maintainer_files { } } +sub maintainers_in_file { + my ($file) = @_; + + return if ($file =~ m@\bMAINTAINERS$@); + + if (-f $file && ($email_file_emails || $file =~ /\.yaml$/)) { + open(my $f, '<', $file) + or die "$P: Can't open $file: $!\n"; + my $text = do { local($/) ; <$f> }; + close($f); + + my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; + push(@file_emails, clean_file_emails(@poss_addr)); + } +} + # # Read mail address map # @@ -504,18 +527,13 @@ sub read_mailmap { ## use the filenames on the command line or find the filenames in the patchfiles -my @files = (); -my @fixes = (); # If a patch description includes Fixes: lines -my @range = (); -my @keyword_tvi = (); -my @file_emails = (); - if (!@ARGV) { push(@ARGV, "&STDIN"); } foreach my $file (@ARGV) { if ($file ne "&STDIN") { + $file = canonpath($file); ##if $file is a directory and it lacks a trailing slash, add one if ((-d $file)) { $file =~ s@([^/])$@$1/@; @@ -527,7 +545,7 @@ foreach my $file (@ARGV) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree push(@files, $file); - if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { + if ($file ne "MAINTAINERS" && -f $file && $keywords) { open(my $f, '<', $file) or die "$P: Can't open $file: $!\n"; my $text = do { local($/) ; <$f> }; @@ -539,10 +557,6 @@ foreach my $file (@ARGV) { } } } - if ($file_emails) { - my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g; - push(@file_emails, clean_file_emails(@poss_addr)); - } } } else { my $file_cnt = @files; @@ -923,6 +937,8 @@ sub get_maintainers { print("\n"); } } + + maintainers_in_file($file); } if ($keywords) { @@ -1835,7 +1851,7 @@ tm toggle maintainers tg toggle git entries tl toggle open list entries ts toggle subscriber list entries -f emails in file [$file_emails] +f emails in file [$email_file_emails] k keywords in file [$keywords] r remove duplicates [$email_remove_duplicates] p# pattern match depth [$pattern_depth] @@ -1960,7 +1976,7 @@ EOT bool_invert(\$email_git_all_signature_types); $rerun = 1; } elsif ($sel eq "f") { - bool_invert(\$file_emails); + bool_invert(\$email_file_emails); $rerun = 1; } elsif ($sel eq "r") { bool_invert(\$email_remove_duplicates); diff --git a/security/keys/internal.h b/security/keys/internal.h index 6d0ca48ae9a5..153d35c20d3d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -350,15 +350,4 @@ static inline void key_check(const struct key *key) #define key_check(key) do {} while(0) #endif - -/* - * Helper function to clear and free a kvmalloc'ed memory object. 
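The motivation for dropping __kvzfree() (removed here) in favor of kvfree_sensitive(): the plain memset() below is a store to memory that is dead once the object is freed, so a compiler may in principle elide it (for example under LTO or after inlining the free path), whereas memzero_explicit() carries a barrier that keeps the clear in place. A sketch of the hazard (illustrative):

        static void leaky_free(void *p, size_t len)
        {
                memset(p, 0, len);      /* dead store: may be optimized away */
                kvfree(p);
        }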
- */ -static inline void __kvzfree(const void *addr, size_t len) -{ - if (addr) { - memset((void *)addr, 0, len); - kvfree(addr); - } -} #endif /* _INTERNAL_H */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a..edde63a63007 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -142,10 +142,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, key_ref_put(keyring_ref); error3: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error2: kfree(description); error: @@ -360,7 +357,7 @@ long keyctl_update_key(key_serial_t id, key_ref_put(key_ref); error2: - __kvzfree(payload, plen); + kvfree_sensitive(payload, plen); error: return ret; } @@ -914,7 +911,7 @@ can_read_key: */ if (ret > key_data_len) { if (unlikely(key_data)) - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } @@ -923,7 +920,7 @@ can_read_key: ret = -EFAULT; break; } - __kvzfree(key_data, key_data_len); + kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); @@ -1225,10 +1222,7 @@ long keyctl_instantiate_key_common(key_serial_t id, keyctl_change_reqkey_auth(NULL); error2: - if (payload) { - memzero_explicit(payload, plen); - kvfree(payload); - } + kvfree_sensitive(payload, plen); error: return ret; } diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 8df6a074e370..849e8226395a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -10,6 +10,7 @@ mlock2-tests mremap_dontunmap on-fault-limit transhuge-stress +protection_keys userfaultfd mlock-intersect-test mlock-random-test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 9f18440080ef..a9026706d597 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -22,6 +22,30 @@ TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += khugepaged +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) + +TARGETS := protection_keys +BINARIES_32 := $(TARGETS:%=%_32) +BINARIES_64 := $(TARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else +TEST_GEN_FILES += protection_keys +endif + ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range @@ -37,6 +61,55 @@ include ../lib.mk $(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + 
+define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): %_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): %_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + $(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index ee06cb0b9efb..3a7b5ef0b0c6 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdlib.h> #include <unistd.h> #include "../kselftest.h" diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new file mode 100644 index 000000000000..622a85848f61 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) 
dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); + write_pkey_reg(pkey_reg); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. 
The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..3be20f5d5275 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
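+	 * (Background: CPUID leaf 0x7, subleaf 0 reports PKU in ECX bit 3 and
+	 * OSPKE in ECX bit 4. OSPKE is only set once the OS has enabled
+	 * CR4.PKE, so cpu_has_pkeys() below has to check both bits.)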
*/ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 480995bceefa..fc19addcb5c8 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt) * * There are examples in here of: * * how to set protection keys on memory - * * how to set/clear bits in PKRU (the rights register) - * * how to handle SEGV_PKRU signals and extract pkey-relevant + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant * information from the siginfo * * Things to add: @@ -22,8 +22,10 @@ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ #include <errno.h> #include <linux/futex.h> +#include <time.h> #include <sys/time.h> #include <sys/syscall.h> #include <string.h> @@ -48,34 +50,10 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) 
#x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -158,12 +136,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -174,7 +146,12 @@ static inline void __page_o_noops(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. */ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); @@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; -#ifdef __i386__ + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 + return (u32) get_pkey_bits(pkey_reg, pkey); +} -#else +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 + __write_pkey_reg(new_pkey_reg); -#endif + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} -void dump_mem(void *dumpme, int len_bytes) +void pkey_disable_set(int pkey, int flags) { - char *c = (void *)dumpme; - int i; + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, 
*ptr); - } + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } /* Failed address bound checks: */ @@ -255,7 +315,7 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkru_faults; +int pkey_faults; int last_si_pkey = -1; void signal_handler(int signum, siginfo_t *si, void *vucontext) { @@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; - u32 *pkru_ptr; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ 
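+	/* x86 only below: locate the PKRU image inside the signal frame's XSAVE area */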
#ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. */ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -391,143 +466,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
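The deleted x86 helper documents the PKRU layout it decodes: two bits per key, with the access-disable bit at position 2*pkey and the write-disable bit just above it, so reading a key's rights is one shift plus a two-bit mask. A short worked example of that arithmetic (pure bit math, no RDPKRU needed; 0x55555554 is the value Linux initializes PKRU to, which access-disables every key except key 0):

#include <stdio.h>

#define PKRU_BITS_PER_PKEY	2
#define PKEY_DISABLE_ACCESS	0x1
#define PKEY_DISABLE_WRITE	0x2

/* mirrors the shift-and-mask in the deleted hw_pkey_get() */
static unsigned int pkru_rights(unsigned int pkru, int pkey)
{
	unsigned int mask = PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE;

	return (pkru >> (pkey * PKRU_BITS_PER_PKEY)) & mask;
}

int main(void)
{
	unsigned int pkru = 0x55555554;

	printf("key 0: %x\n", pkru_rights(pkru, 0));	/* 0: fully enabled */
	printf("key 1: %x\n", pkru_rights(pkru, 1));	/* 1: PKEY_DISABLE_ACCESS */
	return 0;
}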
- */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -561,33 +499,44 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - 
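Where the old code above open-codes every shadow update as a shift by pkey * 2, the generic replacement funnels them all through set_pkey_bits(), which is defined in the new shared pkey-helpers.h and is not visible in this hunk. A plausible reconstruction, assuming the PKEY_MASK used elsewhere in this patch and a two-bits-per-key layout (hypothetical sketch, not the series' verbatim helper):

typedef unsigned long long u64;

#define PKEY_BITS_PER_PKEY	2	/* 2 on x86; arch-specific in the series */
#define PKEY_MASK		((1ULL << PKEY_BITS_PER_PKEY) - 1)

/* hypothetical: swap in 'flags' as the rights for 'pkey', leave the rest */
static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
{
	u64 shift = pkey * PKEY_BITS_PER_PKEY;

	reg &= ~(PKEY_MASK << shift);		/* drop the key's old rights */
	reg |= (flags & PKEY_MASK) << shift;	/* install the new ones */
	return reg;
}

Passing ~PKEY_MASK as flags, as alloc_pkey() does below, therefore clears both bits for the key.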
__LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -612,10 +561,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) @@ -638,8 +587,9 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int 
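alloc_random_pkey() above exists so tests do not always exercise the lowest-numbered key: it drains the kernel's key pool, keeps one survivor at random, and frees the rest. The same idea against the raw glibc wrappers (illustrative; error handling trimmed, and 64 is an arbitrary bound comfortably above any current NR_PKEYS):

#define _GNU_SOURCE
#include <stdlib.h>
#include <sys/mman.h>

static int alloc_random_pkey_sketch(void)
{
	int keys[64], nr = 0, i, keep;

	/* drain the pool: pkey_alloc() fails with ENOSPC once it is empty */
	while (nr < 64) {
		int k = pkey_alloc(0, 0);

		if (k < 0)
			break;
		keys[nr++] = k;
	}
	if (!nr)
		return -1;
	keep = keys[rand() % nr];
	/* hand back every key except the survivor */
	for (i = 0; i < nr; i++)
		if (keys[i] != keep)
			pkey_free(keys[i]);
	return keep;
}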
prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -814,11 +772,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -826,13 +789,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } @@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: @@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -940,24 +905,28 @@ void expected_pk_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler should have cleared out PKEY register to let the * test program continue. We now have to restore it.
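expected_pkey_fault() consumes the counter the signal handler bumps: each call asserts that exactly one new fault arrived since the last checkpoint, then re-arms the checkpoint, while its counterpart below asserts that none did. Distilled to its essentials, the bookkeeping is just (sketch, using the selftest's names):

#include <assert.h>

static int pkey_faults;		/* incremented by the SIGSEGV handler */
static int last_pkey_faults;	/* value at the previous checkpoint */

/* exactly one new protection-key fault must have been taken */
static void expect_one_pkey_fault(void)
{
	assert(last_pkey_faults + 1 == pkey_faults);
	last_pkey_faults = pkey_faults;
}

/* and none at all, cf. do_not_expect_pkey_fault() below */
static void expect_no_pkey_fault(void)
{
	assert(last_pkey_faults == pkey_faults);
}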
*/ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing 
write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. 
*/ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. 
Should clear the @@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, @@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) @@ -1442,7 +1515,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1530,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,18 +1544,19 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; @@ -1495,7 +1569,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 022a1f3b64ef..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -12,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall mov_ss_trap \ 
syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. 
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - dprintf4("pkru now: %08x\n", rdpkru()); - wrpkru(pkru); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - long pkru = rdpkru(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - wrpkru(pkru); - dprintf4("pkru now: %08x\n", rdpkru()); -} - -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKRU_BIT (9) -#define XSTATE_PKRU 0x200 - -int pkru_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKRU is set in XCR0 */ - leaf = XSTATE_PKRU_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKRU_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKRU in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -#endif /* _PKEYS_HELPER_H */
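For reference, the deleted pkru_xstate_offset() above queries CPUID leaf 0xD (the XSAVE enumeration leaf) about sub-leaf 9, the PKRU state component: EBX returns the component's byte offset inside the XSAVE buffer and EAX its size, which is how the signal handler located the saved register inside fpregs. The same query through the compiler-provided cpuid.h helper (sketch; assumes a GCC/clang x86 toolchain):

#include <cpuid.h>	/* __get_cpuid_count() */

#define XSTATE_PKRU_BIT	9	/* PKRU's XSAVE state-component number */

static int pkru_xsave_offset(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(0xd, XSTATE_PKRU_BIT, &eax, &ebx, &ecx, &edx))
		return -1;	/* leaf not supported */
	if (!eax)
		return -1;	/* size 0: PKRU not enumerated */
	return ebx;		/* byte offset within the XSAVE area */
}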