diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-02-01 05:46:22 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-02-01 05:46:22 +0300 |
commit | 73da9e1a9f310a449eeb9bf5735a9cd475fef5e2 (patch) | |
tree | 82cd78255b0a480340a8427e7ba5586df8280ac4 | |
parent | b2fe5fa68642860e7de76167c3111623aa0d5de1 (diff) | |
parent | 3f56a2f8030071cf86520ef4fc3045ba6856e610 (diff) | |
download | linux-73da9e1a9f310a449eeb9bf5735a9cd475fef5e2.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- misc fixes
- ocfs2 updates
- most of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
mm: remove PG_highmem description
tools, vm: new option to specify kpageflags file
mm/swap.c: make functions and their kernel-doc agree
mm, memory_hotplug: fix memmap initialization
mm: correct comments regarding do_fault_around()
mm: numa: do not trap faults on shared data section pages.
hugetlb, mbind: fall back to default policy if vma is NULL
hugetlb, mempolicy: fix the mbind hugetlb migration
mm, hugetlb: further simplify hugetlb allocation API
mm, hugetlb: get rid of surplus page accounting tricks
mm, hugetlb: do not rely on overcommit limit during migration
mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
mm, hugetlb: unify core page allocation accounting and initialization
mm/memcontrol.c: try harder to decrease [memory,memsw].limit_in_bytes
mm/memcontrol.c: make local symbol static
mm/hmm: fix uninitialized use of 'entry' in hmm_vma_walk_pmd()
include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer
mm/compaction.c: fix comment for try_to_compact_pages()
mm/page_ext.c: make page_ext_init a noop when CONFIG_PAGE_EXTENSION but nothing uses it
zsmalloc: use U suffix for negative literals being shifted
...
118 files changed, 2607 insertions, 1863 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5025ff9307e6..ff234d229cbb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold -- hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode - legacy_va_layout @@ -261,30 +260,6 @@ any throttling. ============================================================== -hugepages_treat_as_movable - -This parameter controls whether we can allocate hugepages from ZONE_MOVABLE -or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. -ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, -so this parameter has no effect if used without kernelcore=. - -Hugepage migration is now available in some situations which depend on the -architecture and/or the hugepage size. If a hugepage supports migration, -allocation from ZONE_MOVABLE is always enabled for the hugepage regardless -of the value of this parameter. -IOW, this parameter affects only non-migratable hugepages. - -Assuming that hugepages are not migratable in your system, one usecase of -this parameter is that users can make hugepage pool more extensible by -enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE -page reclaim/migration/compaction work more and you can get contiguous -memory more likely. Note that using ZONE_MOVABLE for non-migratable -hugepages can do harm to other features like memory hotremove (because -memory hotremove expects that memory blocks on ZONE_MOVABLE are always -removable,) so it's a trade-off responsible for the users. - -============================================================== - hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 59cbc803aad6..faf077d50d42 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -20,19 +20,20 @@ options. The /proc/meminfo file provides information about the total number of persistent hugetlb pages in the kernel's huge page pool. It also displays -information about the number of free, reserved and surplus huge pages and the -default huge page size. The huge page size is needed for generating the -proper alignment and size of the arguments to system calls that map huge page -regions. +default huge page size and information about the number of free, reserved +and surplus huge pages in the pool of huge pages of default size. +The huge page size is needed for generating the proper alignment and +size of the arguments to system calls that map huge page regions. The output of "cat /proc/meminfo" will include lines like: ..... -HugePages_Total: vvv -HugePages_Free: www -HugePages_Rsvd: xxx -HugePages_Surp: yyy -Hugepagesize: zzz kB +HugePages_Total: uuu +HugePages_Free: vvv +HugePages_Rsvd: www +HugePages_Surp: xxx +Hugepagesize: yyy kB +Hugetlb: zzz kB where: HugePages_Total is the size of the pool of huge pages. @@ -47,6 +48,14 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. The maximum number of surplus huge pages is controlled by /proc/sys/vm/nr_overcommit_hugepages. +Hugepagesize is the default hugepage size (in Kb). +Hugetlb is the total amount of memory (in kB), consumed by huge + pages of all sizes. + If huge pages of different sizes are in use, this number + will exceed HugePages_Total * Hugepagesize. To get more + detailed information, please, refer to + /sys/kernel/mm/hugepages (described below). + /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index b18fcb606908..dc8ee011882f 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -74,4 +74,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #endif diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1a7a17b2a1ba..2a4836087358 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,9 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */ +#define pmdp_establish generic_pmdp_establish + /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */ static inline pmd_t pmd_mknotpresent(pmd_t pmd) { diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 89167c43ebb5..094374c82db0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -706,6 +706,13 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, { ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } + +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); +} #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index b88a8dd14933..a6f300a208bd 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -115,14 +115,6 @@ static void set_eit_vector_entries(void) _flush_cache_copyback_all(); } -void abort(void) -{ - BUG(); - - /* if that doesn't kill us, halt */ - panic("Oops failed to kill thread"); -} - void __init trap_init(void) { set_eit_vector_entries(); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 1a508a74d48d..129e0328367f 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -534,6 +534,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e92432ae9737..73fcf592ee91 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,7 +151,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 197ced1eaaa0..2d9df40446f6 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -101,8 +101,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 8d40cf03cb67..cb46d1034f33 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -203,8 +203,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 44697817ccc6..6ca1208cedcb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1137,17 +1137,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - if (radix_enabled()) - return radix__pmdp_huge_split_prepare(vma, address, pmdp); - return hash__pmdp_huge_split_prepare(vma, address, pmdp); -} +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 19c44e1495ae..365010f66570 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - /* Nothing to do for radix. */ - return; -} extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 3b65917785a5..422e80253a33 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm) * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + unsigned long old_pmd; + + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ serialize_against_pte_lookup(vma->vm_mm); + return __pmd(old_pmd); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index ec277913e01b..469808e77e58 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - VM_BUG_ON(pmd_devmap(*pmdp)); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); -} - /* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9376637229c9..0105ce28e246 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -108,7 +108,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0a6b0286c32e..2d24d33bf188 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1505,12 +1505,12 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_INVALIDATE -static inline void pmdp_invalidate(struct vm_area_struct *vma, +static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); + return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 9937c5ff94a9..339920fdf9ed 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1010,7 +1010,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 4ae86bc0d35c..847ddffbf38a 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} + /* * This routine is only called when splitting a THP */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - - pmd_val(entry) &= ~_PAGE_VALID; + pmd_t old, entry; - set_pmd_at(vma->vm_mm, address, pmdp, entry); + entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID); + old = pmdp_establish(vma, address, pmdp, entry); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* @@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, if ((pmd_val(entry) & _PAGE_PMD_HUGE) && !is_huge_zero_page(pmd_page(entry))) (vma->vm_mm)->context.thp_pte_count--; + + return old; } void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fcd3b4d24eea..d416fd9fdb3b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index bc4af5453802..f24df59c40b2 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif -#ifdef CONFIG_SMP union split_pmd { struct { u32 pmd_low; @@ -166,6 +165,8 @@ union split_pmd { }; pmd_t pmd; }; + +#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { union split_pmd res, *orig = (union split_pmd *)pmdp; @@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + /* + * If pmd has present bit cleared we can get away without expensive + * cmpxchg64: we can update pmdp half-by-half without racing with + * anybody. + */ + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + union split_pmd old, new, *ptr; + + ptr = (union split_pmd *)pmdp; + + new.pmd = pmd; + + /* xchg acts as a barrier before setting of the high bits */ + old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); + old.pmd_high = ptr->pmd_high; + ptr->pmd_high = new.pmd_high; + return old.pmd; + } + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} +#endif + #ifdef CONFIG_SMP union split_pud { struct { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e42b8943cb1a..63c2552b6b65 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud) return pud_flags(pud) & _PAGE_RW; } +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pmdp, pmd); + } else { + pmd_t old = *pmdp; + *pmdp = pmd; + return old; + } +} +#endif + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e7b3ce123da6..70aceefe14d5 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 7d94e1d39e5e..df72493a0f13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index ed1cf7c5a43b..0a826eb7fe48 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index 9918eda0e05f..a3454eb56fbf 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, @@ -44,6 +44,7 @@ /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; @@ -375,8 +376,8 @@ restart: * unmapped. */ if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_range(mapping, - (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); @@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, - PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } spin_lock_irq(&mapping->tree_lock); @@ -636,8 +635,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); unlock_pmd: - spin_unlock(ptl); #endif + spin_unlock(ptl); } else { if (pfn != pte_pfn(*ptep)) goto unlock_pte; @@ -1269,12 +1268,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { diff --git a/fs/fcntl.c b/fs/fcntl.c index c7b9e0948107..e95fa0a352ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_ADD_SEALS: case F_GET_SEALS: - err = shmem_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, arg); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8a85f3f53446..8fe1b0aa2896 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -55,16 +55,6 @@ struct hugetlbfs_config { umode_t mode; }; -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - int sysctl_hugetlb_shm_group; enum { @@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode_lock(inode); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + inode_unlock(inode); + return -EPERM; + } + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; @@ -570,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. If NUMA is configured, use page index @@ -660,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); BUG_ON(!inode); @@ -668,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - if (attr->ia_size & ~huge_page_mask(h)) + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + + if (newsize & ~huge_page_mask(h)) return -EINVAL; - error = hugetlb_vmtruncate(inode, attr->ia_size); + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + error = hugetlb_vmtruncate(inode, newsize); if (error) return error; } @@ -722,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -729,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; + info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 40b5cc97f7b0..917fadca8a7b 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) if (had_lock < 0) return ERR_PTR(had_lock); + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); @@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab5105f9767e..9a876bb07cac 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); + +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given); +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); + static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!obj) obj = (void *)bh->b_data; et->et_object = obj; + et->et_dealloc = NULL; et->et_ops->eo_fill_root_el(et); if (!et->et_ops->eo_fill_max_leaf_clusters) @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { - int status, new_blocks, i; + int status, new_blocks, i, block_given = 0; u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle, goto bail; } - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, - meta_ac, new_eb_bhs); - if (status < 0) { - mlog_errno(status); - goto bail; + /* Firstyly, try to reuse dealloc since we have already estimated how + * many extent blocks we may use. + */ + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + new_eb_bhs, new_blocks, + &block_given); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + BUG_ON(block_given > new_blocks); + + if (block_given < new_blocks) { + BUG_ON(!meta_ac); + status = ocfs2_create_new_meta_bhs(handle, et, + new_blocks - block_given, + meta_ac, + &new_eb_bhs[block_given]); + if (status < 0) { + mlog_errno(status); + goto bail; + } } /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { - int status, i; + int status, i, block_given = 0; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, - &new_eb_bh); + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + &new_eb_bh, 1, + &block_given); + } else if (meta_ac) { + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + + } else { + BUG(); + } + if (status < 0) { mlog_errno(status); goto bail; @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; - BUG_ON(meta_ac == NULL); + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { @@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle, int i; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; - struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; - el = path_leaf_el(left_path); - eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) @@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { - int ret, i, next_free; + int i, next_free; struct buffer_head *bh; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; @@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - ret = -EIO; return; } @@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle, struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; - struct ocfs2_extent_list *rightmost_el; if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < @@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle, } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - rightmost_el = &eb->h_list; - } else - rightmost_el = path_root_el(path); + } if (rec->e_cpos == split_rec->e_cpos && rec->e_leaf_clusters == split_rec->e_leaf_clusters) @@ -6585,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type, return fl; } +static struct ocfs2_per_slot_free_list * +ocfs2_find_preferred_free_list(int type, + int preferred_slot, + int *real_slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { + *real_slot = fl->f_slot; + return fl; + } + + fl = fl->f_next_suballocator; + } + + /* If we can't find any free list matching preferred slot, just use + * the first one. + */ + fl = ctxt->c_first_suballocator; + *real_slot = fl->f_slot; + + return fl; +} + +/* Return Value 1 indicates empty */ +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) +{ + struct ocfs2_per_slot_free_list *fl = NULL; + + if (!et->et_dealloc) + return 1; + + fl = et->et_dealloc->c_first_suballocator; + if (!fl) + return 1; + + if (!fl->f_first) + return 1; + + return 0; +} + +/* If extent was deleted from tree due to extent rotation and merging, and + * no metadata is reserved ahead of time. Try to reuse some extents + * just deleted. This is only used to reuse extent blocks. + * It is supposed to find enough extent blocks in dealloc if our estimation + * on metadata is accurate. + */ +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given) +{ + int i, status = 0, real_slot; + struct ocfs2_cached_dealloc_ctxt *dealloc; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *bf; + struct ocfs2_extent_block *eb; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + *blk_given = 0; + + /* If extent tree doesn't have a dealloc, this is not faulty. Just + * tell upper caller dealloc can't provide any block and it should + * ask for alloc to claim more space. + */ + dealloc = et->et_dealloc; + if (!dealloc) + goto bail; + + for (i = 0; i < blk_wanted; i++) { + /* Prefer to use local slot */ + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num, &real_slot, + dealloc); + /* If no more block can be reused, we should claim more + * from alloc. Just return here normally. + */ + if (!fl) { + status = 0; + break; + } + + bf = fl->f_first; + fl->f_first = bf->free_next; + + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); + if (new_eb_bh[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mlog(0, "Reusing block(%llu) from " + "dealloc(local slot:%d, real slot:%d)\n", + bf->free_blk, osb->slot_num, real_slot); + + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + new_eb_bh[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; + + /* We can't guarantee that buffer head is still cached, so + * polutlate the extent block again. + */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(bf->free_blk); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = cpu_to_le16(real_slot); + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. + */ + ocfs2_journal_dirty(handle, new_eb_bh[i]); + + if (!fl->f_first) { + dealloc->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + kfree(bf); + } + + *blk_given = i; + +bail: + if (unlikely(status < 0)) { + for (i = 0; i < blk_wanted; i++) + brelse(new_eb_bh[i]); + } + + return status; +} + int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int type, int slot, u64 suballoc, u64 blkno, unsigned int bit) @@ -7382,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; + struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7419,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) trace_ocfs2_trim_fs(start, len, minlen); + ocfs2_trim_fs_lock_res_init(osb); + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == start && info.tf_len == len && + info.tf_minlen == minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out_trimunlock; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = start; + info.tf_len = len; + info.tf_minlen = minlen; + /* Determine first and last group to examine based on start and len */ first_group = ocfs2_which_cluster_group(main_bm_inode, start); if (first_group == osb->first_cluster_group_blkno) @@ -7463,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); } range->len = trimmed * sb->s_blocksize; + + info.tf_trimlen = range->len; + info.tf_success = (ret ? 0 : 1); + pinfo = &info; +out_trimunlock: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 27b75cf32cfa..250bcacdf9e9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; + struct ocfs2_cached_dealloc_ctxt *et_dealloc; }; /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d1516327b787..e8e205bf2e41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt { struct ocfs2_cached_dealloc_ctxt w_dealloc; struct list_head w_unwritten_list; + unsigned int w_unwritten_count; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1386,6 +1387,7 @@ retry: desc->c_clear_unwritten = 0; list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); list_add_tail(&new->ue_node, &wc->w_unwritten_list); + wc->w_unwritten_count++; new = NULL; unlock: spin_unlock(&oi->ip_lock); @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ue->ue_phys = desc->c_phys; list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); - dwc->dw_zero_count++; + dwc->dw_zero_count += wc->w_unwritten_count; } ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + /* Attach dealloc with extent tree in case that we may reuse extents + * which are already unlinked from current extent tree due to extent + * rotation and merging. + */ + et.et_dealloc = &dealloc; + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, &data_ac, &meta_ac); if (ret) { diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 62e8ec619b4c..af2e7473956e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); spin_unlock(&qs->qs_lock); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..0276f7f8d5e6 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -196,7 +196,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 32f9c72dff17..b7520e20a770 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9c3e0f13ca87..a7df226f9449 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1122,13 +1122,6 @@ recheck: /* sleep if we haven't finished voting yet */ if (sleep) { unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); - - /* - if (kref_read(&mle->mle_refs) < 2) - mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - kref_read(&mle->mle_refs), - res->lockname.len, res->lockname.name); - */ atomic_set(&mle->woken, 0); (void)wait_event_timeout(mle->wq, (atomic_read(&mle->woken) == 1), diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4689940a953c..9479f99c2145 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; @@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, + &ocfs2_trim_fs_lops, osb); +} + +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_simple_drop_lockres(osb, lockres); + ocfs2_lock_res_free(lockres); +} + static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { @@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write) return status; } +int ocfs2_try_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu try to take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); + return status; +} + void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; @@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); + /* + * If we can't get inode lock immediately, we should not return + * directly here, since this will lead to a softlockup problem. + * The method is to get a blocking lock and immediately unlock + * before returning, this can avoid CPU resource waste due to + * lots of retries, and benefits fairness in getting lock. + */ + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } @@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode, int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level) + int *level, int wait) { int ret; - ret = ocfs2_inode_lock(inode, NULL, 0); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, 0); + else + ret = ocfs2_try_inode_lock(inode, NULL, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } @@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode, struct buffer_head *bh = NULL; ocfs2_inode_unlock(inode, 0); - ret = ocfs2_inode_lock(inode, &bh, 1); + if (wait) + ret = ocfs2_inode_lock(inode, &bh, 1); + else + ret = ocfs2_try_inode_lock(inode, &bh, 1); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } *level = 1; @@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) ex ? LKM_EXMODE : LKM_PRMODE); } +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock) +{ + int status; + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (info) + info->tf_valid = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, + trylock ? DLM_LKF_NOQUEUE : 0, 0); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + return status; + } + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) { + info->tf_valid = 1; + info->tf_success = lvb->lvb_success; + info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum); + info->tf_start = be64_to_cpu(lvb->lvb_start); + info->tf_len = be64_to_cpu(lvb->lvb_len); + info->tf_minlen = be64_to_cpu(lvb->lvb_minlen); + info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen); + } + } + + return status; +} + +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info) +{ + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (ocfs2_mount_local(osb)) + return; + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION; + lvb->lvb_success = info->tf_success; + lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum); + lvb->lvb_start = cpu_to_be64(info->tf_start); + lvb->lvb_len = cpu_to_be64(info->tf_len); + lvb->lvb_minlen = cpu_to_be64(info->tf_minlen); + lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen); + } + + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index a7fc18ba0dc1..256e0a9067b8 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +#define OCFS2_TRIMFS_LVB_VERSION 1 + +struct ocfs2_trim_fs_lvb { + __u8 lvb_version; + __u8 lvb_success; + __u8 lvb_reserved[2]; + __be32 lvb_nodenum; + __be64 lvb_start; + __be64 lvb_len; + __be64 lvb_minlen; + __be64 lvb_trimlen; +}; + +struct ocfs2_trim_fs_info { + u8 tf_valid; /* lvb is valid, or not */ + u8 tf_success; /* trim is successful, or not */ + u32 tf_nodenum; /* osb node number */ + u64 tf_start; /* trim start offset in clusters */ + u64 tf_len; /* trim end offset in clusters */ + u64 tf_minlen; /* trim minimum contiguous free clusters */ + u64 tf_trimlen; /* trimmed length in bytes */ +}; + struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; @@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_rw_lock(struct inode *inode, int write); +int ocfs2_try_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level); + int *level, int wait); int ocfs2_inode_lock_full_nested(struct inode *inode, struct buffer_head **ret_bh, int ex, @@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +#define ocfs2_try_inode_lock(i, b, e)\ + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ + OI_LS_NORMAL) void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, @@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb); +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb); +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock); +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e4719e0a3f99..06cb96462bf9 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -38,6 +38,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "aops.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -832,6 +833,50 @@ out: return ret; } +/* Is IO overwriting allocated blocks? */ +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len) +{ + int ret = 0, is_last; + u32 mapping_end, cpos; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len)) + return ret; + else + return -EAGAIN; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + NULL, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) + break; + + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + break; + + cpos = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + } + + if (cpos < mapping_end) + ret = -EAGAIN; +out: + return ret; +} + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 67ea57d2fd59..1057586ec19f 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len); + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a1d051055472..5d1784a365a3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) spin_unlock(&oi->ip_lock); } + file->f_mode |= FMODE_NOWAIT; + leave: return status; } @@ -2132,12 +2134,12 @@ out: } static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t pos, - size_t count) + loff_t pos, size_t count, int wait) { - int ret = 0, meta_level = 0; + int ret = 0, meta_level = 0, overwrite_io = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; loff_t end; /* @@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * if we need to make modifications here. */ for(;;) { - ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : &di_bh, meta_level); if (ret < 0) { meta_level = -1; - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } + /* + * Check if IO will overwrite allocated blocks in case + * IOCB_NOWAIT flag is set. + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); + brelse(di_bh); + di_bh = NULL; + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_unlock; + } + } + /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because * remove_suid() calls ->setattr without any hint that @@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, count); + pos, count, wait); + + brelse(di_bh); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2211,7 +2242,7 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, rw_level; + int rw_level; ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from); @@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ + if (!direct_io && nowait) + return -EOPNOTSUPP; + if (count == 0) return 0; - direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - - inode_lock(inode); + if (nowait) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else + inode_lock(inode); /* * Concurrent O_DIRECT writes are allowed with @@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, */ rw_level = (!direct_io || full_coherency || append_write); - ret = ocfs2_rw_lock(inode, rw_level); + if (nowait) + ret = ocfs2_try_rw_lock(inode, rw_level); + else + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_mutex; } @@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, * other nodes to drop their caches. Buffered I/O * already does this in write_begin(). */ - ret = ocfs2_inode_lock(inode, NULL, 1); + if (nowait) + ret = ocfs2_try_inode_lock(inode, NULL, 1); + else + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } count = ret; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } + if (!direct_io && nowait) + return -EOPNOTSUPP; + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = ocfs2_rw_lock(inode, 0); + if (direct_io) { + if (nowait) + ret = ocfs2_try_rw_lock(inode, 0); + else + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } rw_level = 0; @@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, + !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } ocfs2_inode_unlock(inode, lock_level); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 36304434eacf..e5dcea6cee5f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. */ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); lock_buffer(bh); /* - * A previous attempt to write this buffer head failed. - * Nothing we can do but to retry the write and hope for - * the best. + * A previous transaction with a couple of buffer heads fail + * to checkpoint, so all the bhs are marked as BH_Write_EIO. + * For current transaction, the bh is just among those error + * bhs which previous transaction handle. We can't just clear + * its BH_Write_EIO and reuse directly, since other bhs are + * not written to disk yet and that will cause metadata + * inconsistency. So we should set fs read-only to avoid + * further damage. */ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - if (!buffer_uptodate(bh)) { unlock_buffer(bh); - return -EIO; + return ocfs2_error(osb->sb, "A previous attempt to " + "write this buffer head failed\n"); } unlock_buffer(bh); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 098f5c712569..fb9a20e3d608 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), - file->f_path.mnt, &lock_level); + file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9a50f222ac97..6867eef2e06b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -404,6 +404,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_lock_res osb_trim_fs_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index d277aabf5dfb..7051b994c776 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -50,6 +50,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; @@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; + case OCFS2_LOCK_TYPE_TRIM_FS: + c = 'I'; + break; default: c = '\0'; } @@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", + [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a0b5d00ef0a9..e2a11aaece10 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - unsigned long count), - TP_ARGS(ino, saved_pos, count), + unsigned long count, int wait), + TP_ARGS(ino, saved_pos, count, wait), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) __field(unsigned long, count) + __field(int, wait) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; __entry->count = count; + __entry->wait = wait; ), - TP_printk("%llu %llu %lu", __entry->ino, - __entry->saved_pos, __entry->count) + TP_printk("%llu %llu %lu %d", __entry->ino, + __entry->saved_pos, __entry->count, __entry->wait) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 9f0b95abc09f..d8f5f6ce99dc 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + if (undo_fn) + jbd_unlock_bh_state(group_bh); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle, int status; u16 bg_start_bit; u64 bg_blkno; - struct ocfs2_dinode *fe; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ - BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, + ocfs2_blocks_to_clusters(bitmap_inode->i_sb, + start_blk))); - fe = (struct ocfs2_dinode *) bitmap_bh->b_data; ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80efa5699fb0..ffa4952d432b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; @@ -1208,14 +1207,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1843,6 +1843,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c5898c59d411..c261c1dfd374 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) a_size <<= 1; } else if (acl_len != 0 && acl_len != -ENODATA) { + ret = acl_len; mlog_errno(ret); return ret; } @@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, * and then insert the extents one by one. */ if (xv->xr_list.l_tree_depth) { - memcpy(new_xv, &def_xv, sizeof(def_xv)); + memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE); vb->vb_xv = new_xv; vb->vb_bh = value_bh; ocfs2_init_xattr_value_extent_tree(&data_et, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 339e4c1c044d..ec6d2983a5cb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; - lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" @@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, + mm->stack_vm << (PAGE_SHIFT-10), + text >> 10, + lib >> 10, mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); @@ -977,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) + if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 743eaa646898..87a13a7c8270 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * pmd_trans_unstable) of the pmd. */ _pmd = READ_ONCE(*pmd); - if (!pmd_present(_pmd)) + if (pmd_none(_pmd)) goto out; ret = false; + if (!pmd_present(_pmd)) + goto out; + if (pmd_trans_huge(_pmd)) goto out; @@ -985,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, struct uffd_msg *msg) { int fd; - struct file *file; - unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; - fd = get_unused_fd_flags(flags); + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); if (fd < 0) return fd; - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | flags); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - - fd_install(fd, file); msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; - return 0; } @@ -1884,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_init(&ctx->refile_seq); } -/** - * userfaultfd_file_create - Creates a userfaultfd file pointer. - * @flags: Flags for the userfaultfd file. - * - * This function creates a userfaultfd file pointer, w/out installing - * it into the fd table. This is useful when the userfaultfd file is - * used during the initialization of data structures that require - * extra setup after the userfaultfd creation. So the userfaultfd - * creation is split into the file pointer creation phase, and the - * file descriptor installation phase. In this way races with - * userspace closing the newly installed file descriptor can be - * avoided. Returns a userfaultfd file pointer, or a proper error - * pointer. - */ -static struct file *userfaultfd_file_create(int flags) +SYSCALL_DEFINE1(userfaultfd, int, flags) { - struct file *file; struct userfaultfd_ctx *ctx; + int fd; BUG_ON(!current->mm); @@ -1909,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags) BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - file = ERR_PTR(-EINVAL); if (flags & ~UFFD_SHARED_FCNTL_FLAGS) - goto out; + return -EINVAL; - file = ERR_PTR(-ENOMEM); ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) - goto out; + return -ENOMEM; atomic_set(&ctx->refcount, 1); ctx->flags = flags; @@ -1927,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); - if (IS_ERR(file)) { + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } -out: - return file; -} - -SYSCALL_DEFINE1(userfaultfd, int, flags) -{ - int fd, error; - struct file *file; - - error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (error < 0) - return error; - fd = error; - - file = userfaultfd_file_create(flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - - return error; } static int __init userfaultfd_init(void) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 868e68561f91..2cfa3075d148 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -309,19 +309,26 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif -#ifndef __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); -#endif - -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is an implementation of pmdp_establish() that is only suitable for an + * architecture that doesn't have hardware dirty/accessed bits. In this case we + * can't race with CPU which sets these bits and non-atomic aproach is fine. + */ +static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) { - + pmd_t old_pmd = *pmdp; + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + return old_pmd; } #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..36fa6a2a82e3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -129,7 +130,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; @@ -158,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -198,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) +#define move_hugetlb_state(old, new, reason) do {} while (0) static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) @@ -271,6 +273,17 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) return sb->s_fs_info; } +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; + unsigned int seals; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, @@ -343,10 +356,10 @@ struct huge_bootmem_page { struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -524,7 +537,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_noerr(v, a, r) NULL +#define alloc_huge_page_vma(h, vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..882046863581 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -108,7 +108,10 @@ struct lruvec_stat { */ struct mem_cgroup_per_node { struct lruvec lruvec; - struct lruvec_stat __percpu *lruvec_stat; + + struct lruvec_stat __percpu *lruvec_stat_cpu; + atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -227,10 +230,10 @@ struct mem_cgroup { spinlock_t move_lock; struct task_struct *move_lock_task; unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; + + struct mem_cgroup_stat_cpu __percpu *stat_cpu; + atomic_long_t stat[MEMCG_NR_STAT]; + atomic_long_t events[MEMCG_NR_EVENTS]; unsigned long socket_pressure; @@ -265,6 +268,12 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define MEMCG_CHARGE_BATCH 32U + extern struct mem_cgroup *root_mem_cgroup; static inline bool mem_cgroup_disabled(void) @@ -272,13 +281,6 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) -{ - this_cpu_inc(memcg->stat->events[event]); - cgroup_file_notify(&memcg->events_file); -} - bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -492,32 +494,38 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + long x = atomic_long_read(&memcg->stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->count[idx], val); + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->stat[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->count[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + preempt_disable(); + __mod_memcg_state(memcg, idx, val); + preempt_enable(); } /** @@ -555,87 +563,108 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long val = 0; - int cpu; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - val += per_cpu(pn->lruvec_stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; + long x; + /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + + /* Update memcg */ __mod_memcg_state(pn->memcg, idx, val); - __this_cpu_add(pn->lruvec_stat->count[idx], val); + + /* Update lruvec */ + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - if (mem_cgroup_disabled()) - return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - mod_memcg_state(pn->memcg, idx, val); - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_state(lruvec, idx, val); + preempt_enable(); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; - __mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!page->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); return; - __mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - __this_cpu_add(pn->lruvec_stat->count[idx], val); + } + + lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + __mod_lruvec_state(lruvec, idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) - return; - mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_page_state(page, idx, val); + preempt_enable(); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +/* idx can be of type enum memcg_event_item or vm_event_item */ +static inline void __count_memcg_events(struct mem_cgroup *memcg, + int idx, unsigned long count) +{ + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->events[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->events[idx], x); +} + static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) + int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->events[idx], count); + preempt_disable(); + __count_memcg_events(memcg, idx, count); + preempt_enable(); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, int idx) { @@ -654,12 +683,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) { - this_cpu_inc(memcg->stat->events[idx]); + count_memcg_events(memcg, idx, 1); if (idx == OOM_KILL) cgroup_file_notify(&memcg->events_file); } rcu_read_unlock(); } + +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) +{ + count_memcg_events(memcg, event, 1); + cgroup_file_notify(&memcg->events_file); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 7fc92384977e..173d2484f6e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk, BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, - unsigned int gup_flags); +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cfd0ac4e5e0e..fd1af6b9591d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,28 +31,56 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. + * who is mapping it. If you allocate the page using alloc_pages(), you + * can use some of the space in struct page for your own purposes. * - * The objects in struct page are organized in double word blocks in - * order to allows us to use atomic double word operations on portions - * of struct page. That is currently only used by slub but the arrangement - * allows the use of atomic double word operations on the flags/mapping - * and lru list pointers also. + * Pages that were once in the page cache may be found under the RCU lock + * even after they have been recycled to a different purpose. The page + * cache reads and writes some of the fields in struct page to pin the + * page before checking that it's still in the page cache. It is vital + * that all users of struct page: + * 1. Use the first word as PageFlags. + * 2. Clear or preserve bit 0 of page->compound_head. It is used as + * PageTail for compound pages, and the page cache must not see false + * positives. Some users put a pointer here (guaranteed to be at least + * 4-byte aligned), other users avoid using the field altogether. + * 3. page->_refcount must either not be used, or must be used in such a + * way that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * + * If you allocate pages of order > 0, you can use the fields in the struct + * page associated with each page, but bear in mind that the pages may have + * been inserted individually into the page cache, so you must use the above + * four fields in a compatible way for each struct page. + * + * SLUB uses cmpxchg_double() to atomically update its freelist and + * counters. That requires that freelist & counters be adjacent and + * double-word aligned. We align all struct pages to double-word + * boundaries, and ensure that 'freelist' is aligned within the + * struct. */ +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE +#define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) +#define _slub_counter_t unsigned long +#else +#define _slub_counter_t unsigned int +#endif +#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#define _struct_page_alignment +#define _slub_counter_t unsigned int +#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ + struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object - * or KSM private structure. See - * PAGE_MAPPING_ANON and - * PAGE_MAPPING_KSM. - */ + /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ @@ -66,40 +94,27 @@ struct page { }; union { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; -#else - /* - * Keep _refcount separate from slub cmpxchg_double data. - * As the rest of the double word is protected by slab_lock - * but _refcount is not. - */ - unsigned counters; -#endif - struct { + _slub_counter_t counters; + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ + + struct { /* Page cache */ + /* + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. + */ + atomic_t _mapcount; - union { - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; - - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; /* * Usage count, *USE WRAPPER FUNCTION* when manual * accounting. See page_ref.h @@ -109,8 +124,6 @@ struct page { }; /* - * Third double word block - * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). @@ -145,19 +158,9 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ -#ifdef CONFIG_64BIT - /* - * On 64 bit system we have enough space in struct page - * to encode compound_dtor and compound_order with - * unsigned int. It can help compiler generate better or - * smaller code on some archtectures. - */ - unsigned int compound_dtor; - unsigned int compound_order; -#else - unsigned short int compound_dtor; - unsigned short int compound_order; -#endif + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS @@ -171,15 +174,14 @@ struct page { #endif }; - /* Remainder is not double word aligned */ union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; @@ -212,15 +214,7 @@ struct page { #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif -} -/* - * The struct page can be forced to be double word aligned so that atomic ops - * on double words work. The SLUB allocator can make use of such a feature. - */ -#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE - __aligned(2 * sizeof(unsigned long)) -#endif -; +} _struct_page_alignment; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b25dc9db19fc..2d07a1ed5a31 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H +#include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> @@ -10,6 +11,9 @@ struct mmu_notifier; struct mmu_notifier_ops; +/* mmu_notifier_ops flags */ +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -27,6 +31,15 @@ struct mmu_notifier_mm { struct mmu_notifier_ops { /* + * Flags to specify behavior of callbacks for this MMU notifier. + * Used to determine which context an operation may be called. + * + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not + * block + */ + int flags; + + /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier @@ -137,6 +150,10 @@ struct mmu_notifier_ops { * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. + * + * If both of these callbacks cannot block, and invalidate_range + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, @@ -159,12 +176,13 @@ struct mmu_notifier_ops { * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.txt * - * The invalidate_range() function is called under the ptl - * spin-lock and not allowed to sleep. - * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. + * + * If this callback cannot block, and invalidate_range_{start,end} + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + return false; +} + static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c38939..7522a6987595 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void); /* * We use the lower bits of the mem_map pointer to store - * a little bit of information. There should be at least - * 3 bits here due to 32-bit alignment. + * a little bit of information. The pointer is calculated + * as mem_map - section_nr_to_pfn(pnum). The result is + * aligned to the minimum alignment of the two values: + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT + * lowest bits. PFN_SECTION_SHIFT is arch-specific + * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the + * worst combination is powerpc with 256k pages, + * which results in PFN_SECTION_SHIFT equal 6. + * To sum it up, at least 6 bits are available. */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3ec44e27aa9d..50c2b8786831 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -46,11 +46,6 @@ * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * - * PG_highmem pages are not permanently mapped into the kernel virtual address - * space, they need to be kmapped separately for doing IO on the pages. The - * struct page (these bits with information) are always mapped into kernel - * address space... - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5fb6580f7f23..6dc456ac6136 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,14 +9,14 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H -/* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +/* 15 pointers + header align the pagevec structure to a power of two */ +#define PAGEVEC_SIZE 15 struct page; struct address_space; struct pagevec { - unsigned long nr; + unsigned char nr; bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..bd422561a75e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -11,7 +11,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. @@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -static inline void mmdrop_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmdrop(mm); -} - -static inline void mmdrop_async(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) { - INIT_WORK(&mm->async_put_work, mmdrop_async_fn); - schedule_work(&mm->async_put_work); - } -} +extern void mmdrop(struct mm_struct *mm); /** * mmget() - Pin the address space associated with a &struct mm_struct. diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 06b295bec00d..73b5e655a76e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,13 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern int shmem_add_seals(struct file *file, unsigned int seals); -extern int shmem_get_seals(struct file *file); -extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } diff --git a/include/linux/swap.h b/include/linux/swap.h index c2b8128799c1..7b6a59f722a3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); -extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); @@ -345,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); -extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c9817b39..a4c2317d8b9f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, - enum node_stat_item item) -{ - long x = atomic_long_read(&pgdat->vm_stat[item]); - -#ifdef CONFIG_SMP - int cpu; - for_each_online_cpu(cpu) - x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; - - if (x < 0) - x = 0; -#endif - return x; -} - - #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 004ba807df96..7238865e75b0 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); +bool zpool_evictable(struct zpool *pool); + #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d70b53e65f43..e0b8b9173e1c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, - long nr_objects_to_shrink, unsigned long pgs_scanned, - unsigned long lru_pgs, unsigned long cache_items, - unsigned long long delta, unsigned long total_scan), + long nr_objects_to_shrink, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan, + int priority), - TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, - cache_items, delta, total_scan), + TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, + priority), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start, __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) - __field(unsigned long, pgs_scanned) - __field(unsigned long, lru_pgs) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) + __field(int, priority) ), TP_fast_assign( @@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; - __entry->pgs_scanned = pgs_scanned; - __entry->lru_pgs = lru_pgs; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; + __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), - __entry->pgs_scanned, - __entry->lru_pgs, __entry->cache_items, __entry->delta, - __entry->total_scan) + __entry->total_scan, + __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..5e6cf0dd031c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> +#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. + */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -594,181 +830,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -858,27 +921,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. */ @@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, diff --git a/mm/Kconfig b/mm/Kconfig index 03ff7703d322..c782e8fb7235 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. -# For architectures that support deferred memory initialisation -config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - bool - config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on NO_BOOTMEM && MEMORY_HOTPLUG + depends on NO_BOOTMEM depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a diff --git a/mm/compaction.c b/mm/compaction.c index 10cd757f1006..2c8999d027ab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation - * @mode: The migration mode for async, sync light, or sync migration + * @prio: Determines how hard direct compaction should try to succeed * * This is the main entry point for direct page compaction. */ diff --git a/mm/fadvise.c b/mm/fadvise.c index ec70d6e4b86d..767887f5f3bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); - if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* + * The page at end_index will be inclusively discarded according + * by invalidate_mapping_pages(), so subtracting 1 from + * end_index means we will skip the last page. But if endbyte + * is page aligned or is at the end of file, we should not skip + * that page - discarding the last page is safe enough. + */ + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && + endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole diff --git a/mm/filemap.c b/mm/filemap.c index ee83baaf855d..693f62212a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,6 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/cpuset.h> -#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> @@ -418,7 +418,7 @@ again: } if (!pte_present(pte)) { - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { if (hmm_vma_walk->fault) @@ -426,8 +426,6 @@ again: continue; } - entry = pte_to_swp_entry(pte); - /* * This is a special swap entry, ignore migration, use * device and report anything else as error. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..87ab9b8f56b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = *pmd; - pmdp_invalidate(vma, addr, pmd); - - /* - * Recover dirty/young flags. It relies on pmdp_invalidate to not - * corrupt them. - */ - if (pmd_dirty(*pmd)) - entry = pmd_mkdirty(entry); - if (pmd_young(*pmd)) - entry = pmd_mkyoung(entry); + entry = pmdp_invalidate(vma, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) @@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t _pmd; - bool young, write, dirty, soft_dirty, pmd_migration = false; + pmd_t old_pmd, _pmd; + bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - pmd_migration = is_pmd_migration_entry(*pmd); + pmd_migration = is_pmd_migration_entry(old_pmd); if (pmd_migration) { swp_entry_t entry; - entry = pmd_to_swp_entry(*pmd); + entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); } else #endif - page = pmd_page(*pmd); + page = pmd_page(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - write = pmd_write(*pmd); - young = pmd_young(*pmd); - dirty = pmd_dirty(*pmd); - soft_dirty = pmd_soft_dirty(*pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); - pmdp_huge_split_prepare(vma, haddr, pmd); + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_mksoft_dirty(entry); } - if (dirty) - SetPageDirty(page + i); pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); @@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * and pmd_trans_splitting must remain set at all times on the pmd - * until the split is complete for this pmd), then we flush the SMP TLB - * and finally we write the non-huge version of the pmd entry with - * pmd_populate. - */ - pmdp_invalidate(vma, haddr, pmd); pmd_populate(mm, pmd, pgtable); if (freeze) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a334f5fb730..7c204e3d132b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,10 +34,9 @@ #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/userfaultfd_k.h> +#include <linux/page_owner.h> #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +925,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, struct hstate *h) +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; @@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - gfp_t gfp_mask; - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { spin_lock_irqsave(&zone->lock, flags); pfn = ALIGN(zone->zone_start_pfn, nr_pages); @@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) -{ - struct page *page; - - page = alloc_gigantic_page(nid, h); - if (page) { - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, nid); - } - - return page; -} - -static int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) -{ - struct page *page = NULL; - int nr_nodes, node; - - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_gigantic_page_node(h, node); - if (page) - return 1; - } - - return 0; -} - #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } -static inline int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) { return 0; } #endif static void update_and_free_page(struct hstate *h, struct page *page) @@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page) ClearPagePrivate(&page[1]); } +/* + * Internal hugetlb specific page flag. Do not use outside of the hugetlb + * code + */ +static inline bool PageHugeTemporary(struct page *page) +{ + if (!PageHuge(page)) + return false; + + return (unsigned long)page[2].mapping == -1U; +} + +static inline void SetPageHugeTemporary(struct page *page) +{ + page[2].mapping = (void *)-1U; +} + +static inline void ClearPageHugeTemporary(struct page *page) +{ + page[2].mapping = NULL; +} + void free_huge_page(struct page *page) { /* @@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid]) { + if (PageHugeTemporary(page)) { + list_del(&page->lru); + ClearPageHugeTemporary(page); + update_and_free_page(h, page); + } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +static struct page *alloc_buddy_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { + int order = huge_page_order(h); struct page *page; - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_RETRY_MAYFAIL|__GFP_NOWARN, - huge_page_order(h)); - if (page) { - prep_new_huge_page(h, page, nid); - } + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + if (page) + __count_vm_event(HTLB_BUDDY_PGALLOC); + else + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return page; +} + +/* + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + */ +static struct page *alloc_fresh_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + else + page = alloc_buddy_huge_page(h, gfp_mask, + nid, nmask); + if (!page) + return NULL; + + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, page_to_nid(page)); return page; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +/* + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved + * manner. + */ +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; - int ret = 0; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page_node(h, node); - if (page) { - ret = 1; + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); + if (page) break; - } } - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + if (!page) + return 0; - return ret; + put_page(page); /* free it into the hugepage allocator */ + + return 1; } /* @@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) -{ - int order = huge_page_order(h); - - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); -} - -static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, +/* + * Allocates a fresh surplus page from the page allocator. + */ +static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; - unsigned int r_nid; + struct page *page = NULL; if (hstate_is_gigantic(h)) return NULL; + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) + goto out_unlock; + spin_unlock(&hugetlb_lock); + + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + spin_lock(&hugetlb_lock); /* - * Assume we will successfully allocate the surplus page to - * prevent racing processes from causing the surplus to exceed - * overcommit - * - * This however introduces a different race, where a process B - * tries to grow the static hugepage pool while alloc_pages() is - * called by process A. B will only examine the per-node - * counters in determining if surplus huge pages can be - * converted to normal huge pages in adjust_pool_surplus(). A - * won't be able to increment the per-node counter, until the - * lock is dropped by B, but B doesn't drop hugetlb_lock until - * no more huge pages can be converted from surplus to normal - * state (and doesn't try to convert again). Thus, we have a - * case where a surplus huge page exists, the pool is grown, and - * the surplus huge page still exists after, even though it - * should just have been converted to a normal huge page. This - * does not leak memory, though, as the hugepage will be freed - * once it is out of use. It also does not allow the counters to - * go out of whack in adjust_pool_surplus() as we don't modify - * the node values until we've gotten the hugepage and only the - * per-node value is checked there. + * We could have raced with the pool size change. + * Double check that and simply deallocate the new page + * if we would end up overcommiting the surpluses. Abuse + * temporary page to workaround the nasty free_huge_page + * codeflow */ - spin_lock(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - spin_unlock(&hugetlb_lock); - return NULL; + SetPageHugeTemporary(page); + put_page(page); + page = NULL; } else { - h->nr_huge_pages++; h->surplus_huge_pages++; + h->nr_huge_pages_node[page_to_nid(page)]++; } + +out_unlock: spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + return page; +} - spin_lock(&hugetlb_lock); - if (page) { - INIT_LIST_HEAD(&page->lru); - r_nid = page_to_nid(page); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_hugetlb_cgroup(page, NULL); - /* - * We incremented the global counters already - */ - h->nr_huge_pages_node[r_nid]++; - h->surplus_huge_pages_node[r_nid]++; - __count_vm_event(HTLB_BUDDY_PGALLOC); - } else { - h->nr_huge_pages--; - h->surplus_huge_pages--; - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - } - spin_unlock(&hugetlb_lock); +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + return NULL; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + /* + * We do not account these pages as surplus because they are only + * temporary and will be released properly on the last reference + */ + SetPageHugeTemporary(page); return page; } @@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct page *page; @@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } -/* - * This allocation function is useful in the context where vma is irrelevant. - * E.g. soft-offlining uses this function because it only cares physical - * address of error page. - */ +/* page migration callback function */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { gfp_t gfp_mask = htlb_alloc_mask(h); @@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } - +/* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) { @@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - /* No reservations, try to overcommit */ + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); +} + +/* mempolicy aware migration callback */ +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct page *page; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask); + mpol_cond_put(mpol); - return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); + return page; } /* @@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); + page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { @@ -2074,20 +2097,6 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -/* - * alloc_huge_page()'s wrapper which simply returns the page if allocation - * succeeds, otherwise NULL. This function is called from new_vma_page(), - * where no ERR_VALUE is expected to be returned. - */ -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) -{ - struct page *page = alloc_huge_page(vma, addr, avoid_reserve); - if (IS_ERR(page)) - page = NULL; - return page; -} - int alloc_bootmem_huge_page(struct hstate *h) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h) @@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void) prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + /* * If we had gigantic hugepages allocated at boot time, we need * to restore the 'stolen' pages to totalram_pages in order to @@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h, + } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY])) break; cond_resched(); @@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_buddy_huge_page() here and be unable + * We might race with alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - if (hstate_is_gigantic(h)) - ret = alloc_fresh_gigantic_page(h, nodes_allowed); - else - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * __alloc_buddy_huge_page() is checking the global counter, + * alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -2975,20 +2983,32 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { - struct hstate *h = &default_hstate; + struct hstate *h; + unsigned long total = 0; + if (!hugepages_supported()) return; - seq_printf(m, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %8lu kB\n", - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages, - h->surplus_huge_pages, - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + + for_each_hstate(h) { + unsigned long count = h->nr_huge_pages; + + total += (PAGE_SIZE << huge_page_order(h)) * count; + + if (h == &default_hstate) + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + count, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + (PAGE_SIZE << huge_page_order(h)) / 1024); + } + + seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); } int hugetlb_report_node_meminfo(int nid, char *buf) @@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +{ + struct hstate *h = page_hstate(oldpage); + + hugetlb_cgroup_migrate(oldpage, newpage); + set_page_owner_migrate_reason(newpage, reason); + + /* + * transfer temporary state of the new huge page. This is + * reverse to other transitions because the newpage is going to + * be final while the old one will be freed so it takes over + * the temporary status. + * + * Also note that we have to transfer the per-node surplus state + * here as well otherwise the global surplus count will not match + * the per-node's. + */ + if (PageHugeTemporary(newpage)) { + int old_nid = page_to_nid(oldpage); + int new_nid = page_to_nid(newpage); + + SetPageHugeTemporary(oldpage); + ClearPageHugeTemporary(newpage); + + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages_node[old_nid]) { + h->surplus_huge_pages_node[old_nid]--; + h->surplus_huge_pages_node[new_nid]++; + } + spin_unlock(&hugetlb_lock); + } +} diff --git a/mm/interval_tree.c b/mm/interval_tree.c index b47664358796..27ddfd29112a 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { - return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; + return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea4ff259b671..b7e2268dfc9a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_range(mapping, index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, 1, false); spin_lock_irq(&mapping->tree_lock); @@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, spin_unlock(&khugepaged_mm_lock); mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. + */ + vma = NULL; + if (unlikely(!down_read_trylock(&mm->mmap_sem))) + goto breakouterloop_mmap_sem; + if (likely(!khugepaged_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f656ca27f6c2..e83987c55a08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -91,7 +91,6 @@ #include <linux/stacktrace.h> #include <linux/cache.h> #include <linux/percpu.h> -#include <linux/hardirq.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/mmzone.h> diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9011997d8a5c..0ae2dc3a1748 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * Return page count for single (non recursive) @memcg. - * - * Implementation Note: reading percpu statistics for memcg. - * - * Both of vmstat[] and percpu_counter has threshold and do periodic - * synchronization to implement "quick" read. There are trade-off between - * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronization of counter in memcg's counter. - * - * But this _read() function is used for user interface now. The user accounts - * memory usage by memory cgroup and he _always_ requires exact value because - * he accounts memory. Even if we provide quick-and-fuzzy read, we always - * have to visit all online cpus and make sum. So, for now, unnecessary - * synchronization is not implemented. (just implemented for cpu hotplug) - * - * If there are kernel internal actions which can make use of some not-exact - * value, and reading all cpu value can be performance bottleneck in some - * common workload, threshold and synchronization as vmstat[] should be - * implemented. - * - * The parameter idx can be of type enum memcg_event_item or vm_event_item. - */ - static unsigned long memcg_sum_events(struct mem_cgroup *memcg, int event) { - unsigned long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[event], cpu); - return val; + return atomic_long_read(&memcg->events[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); else { - __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); if (PageSwapBacked(page)) - __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[PGPGIN]); + __count_memcg_events(memcg, PGPGIN, 1); else { - __this_cpu_inc(memcg->stat->events[PGPGOUT]); + __count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, @@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->nr_page_events); - next = __this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->stat_cpu->nr_page_events); + next = __this_cpu_read(memcg->stat_cpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->stat_cpu->targets[target], next); return true; } return false; @@ -1124,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -unsigned int memcg1_stats[] = { +static const unsigned int memcg1_stats[] = { MEMCG_CACHE, MEMCG_RSS, MEMCG_RSS_HUGE, @@ -1206,20 +1177,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } /* - * This function returns the number of memcg under hierarchy tree. Returns - * 1(self count) if no children. - */ -static int mem_cgroup_count_children(struct mem_cgroup *memcg) -{ - int num = 0; - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - num++; - return num; -} - -/* * Return the memory (and swap, if configured) limit for a memcg. */ unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) @@ -1707,11 +1664,6 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -/* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. - */ -#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1739,7 +1691,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) unsigned long flags; bool ret = false; - if (nr_pages > CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH) return ret; local_irq_save(flags); @@ -1808,7 +1760,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; - if (stock->nr_pages > CHARGE_BATCH) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); local_irq_restore(flags); @@ -1858,9 +1810,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); + + for_each_mem_cgroup(memcg) { + int i; + + for (i = 0; i < MEMCG_NR_STAT; i++) { + int nid; + long x; + + x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &memcg->stat[i]); + + if (i >= NR_VM_NODE_STAT_ITEMS) + continue; + + for_each_node(nid) { + struct mem_cgroup_per_node *pn; + + pn = mem_cgroup_nodeinfo(memcg, nid); + x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &pn->lruvec_stat[i]); + } + } + + for (i = 0; i < MEMCG_NR_EVENTS; i++) { + long x; + + x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); + if (x) + atomic_long_add(x, &memcg->events[i]); + } + } + return 0; } @@ -1881,7 +1868,7 @@ static void high_work_func(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } /* @@ -1905,7 +1892,7 @@ void mem_cgroup_handle_over_high(void) static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { - unsigned int batch = max(CHARGE_BATCH, nr_pages); + unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; @@ -2415,18 +2402,11 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], - HPAGE_PMD_NR); + __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - int nr_entries) -{ - this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); -} - /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -2450,8 +2430,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, -1); - mem_cgroup_swap_statistics(to, 1); + mod_memcg_state(from, MEMCG_SWAP, -1); + mod_memcg_state(to, MEMCG_SWAP, 1); return 0; } return -EINVAL; @@ -2467,23 +2447,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit) + unsigned long limit, bool memsw) { - unsigned long curusage; - unsigned long oldusage; bool enlarge = false; - int retry_count; int ret; - - /* - * For keeping hierarchical_reclaim simple, how long we should retry - * is depends on callers. We set our retry-count to be function - * of # of children which we should visit in this loop. - */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memory); + bool limits_invariant; + struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; do { if (signal_pending(current)) { @@ -2492,79 +2461,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } mutex_lock(&memcg_limit_mutex); - if (limit > memcg->memsw.limit) { + /* + * Make sure that the new limit (memsw or memory limit) doesn't + * break our basic invariant rule memory.limit <= memsw.limit. + */ + limits_invariant = memsw ? limit >= memcg->memory.limit : + limit <= memcg->memsw.limit; + if (!limits_invariant) { mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; break; } - if (limit > memcg->memory.limit) + if (limit > counter->limit) enlarge = true; - ret = page_counter_limit(&memcg->memory, limit); + ret = page_counter_limit(counter, limit); mutex_unlock(&memcg_limit_mutex); if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - - curusage = page_counter_read(&memcg->memory); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); - - if (!ret && enlarge) - memcg_oom_recover(memcg); - - return ret; -} - -static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - unsigned long curusage; - unsigned long oldusage; - bool enlarge = false; - int retry_count; - int ret; - - /* see mem_cgroup_resize_res_limit */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memsw); - - do { - if (signal_pending(current)) { - ret = -EINTR; + if (!try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, !memsw)) { + ret = -EBUSY; break; } - - mutex_lock(&memcg_limit_mutex); - if (limit < memcg->memory.limit) { - mutex_unlock(&memcg_limit_mutex); - ret = -EINVAL; - break; - } - if (limit > memcg->memsw.limit) - enlarge = true; - ret = page_counter_limit(&memcg->memsw, limit); - mutex_unlock(&memcg_limit_mutex); - - if (!ret) - break; - - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - - curusage = page_counter_read(&memcg->memsw); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); + } while (true); if (!ret && enlarge) memcg_oom_recover(memcg); @@ -3020,10 +2941,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, true); break; case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); @@ -4168,8 +4089,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat = alloc_percpu(struct lruvec_stat); - if (!pn->lruvec_stat) { + pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat_cpu) { kfree(pn); return 1; } @@ -4187,7 +4108,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - free_percpu(pn->lruvec_stat); + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -4197,7 +4118,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat); + free_percpu(memcg->stat_cpu); kfree(memcg); } @@ -4226,8 +4147,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) + memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat_cpu) goto fail; for_each_node(node) @@ -4584,8 +4505,8 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); + __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); + __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); } /* @@ -4597,16 +4518,14 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], - nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_DIRTY], - nr_pages); + __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); + __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); - __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); + __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); + __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } /* @@ -5642,12 +5561,12 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); - __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); - __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); - __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); + __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); + __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); + __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -5874,7 +5793,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; - this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5895,7 +5814,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } - this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); refill_stock(memcg, nr_pages); } @@ -6019,7 +5938,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, nr_entries); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); page->mem_cgroup = NULL; @@ -6085,7 +6004,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); return 0; } @@ -6113,7 +6032,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) else page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, -nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); diff --git a/mm/memory.c b/mm/memory.c index 793004608332..53373b7a1512 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @start and @end are set to 0 and -1 + * respectively when @mm is without users and we're going to destroy + * the full address space (exit/execve). */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -2792,8 +2799,37 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. + * @nr: Number of pages to be unmapped. 0 to unmap to end of file. + * @even_cows: Whether to unmap even private COWed pages. + * + * Unmap the pages in this address space from any userspace process which + * has them mmaped. Generally, you want to remove COWed pages as well when + * a file is being truncated, but not when invalidating pages from the page + * cache. + */ +void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, + pgoff_t nr, bool even_cows) +{ + struct zap_details details = { }; + + details.check_mapping = even_cows ? NULL : mapping; + details.first_index = start; + details.last_index = start + nr - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_range - unmap the portion of all mmaps in the specified - * address_space corresponding to the specified page range in the underlying + * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. @@ -2811,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2823,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows ? NULL : mapping; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - - i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - i_mmap_unlock_write(mapping); + unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); @@ -3485,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val) } /* - * fault_around_pages() and fault_around_mask() expects fault_around_bytes - * rounded down to nearest page order. It's what do_fault_around() expects to - * see. + * fault_around_bytes must be rounded down to the nearest page order as it's + * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { @@ -3530,13 +3555,14 @@ late_initcall(fault_around_debugfs); * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * - * fault_around_pages() defines how many pages we'll try to map. - * do_fault_around() expects it to return a power of two less than or equal to - * PTRS_PER_PTE. + * fault_around_bytes defines how many bytes we'll try to map. + * do_fault_around() expects it to be set to a power of two less than or equal + * to PTRS_PER_PTE. * - * The virtual address of the area that we map is naturally aligned to the - * fault_around_pages() value (and therefore to page order). This way it's - * easier to guarantee that we don't cross page table boundaries. + * The virtual address of the area that we map is naturally aligned to + * fault_around_bytes rounded down to the machine page size + * (and therefore to page order). This way it's easier to guarantee + * that we don't cross page table boundaries. */ static int do_fault_around(struct vm_fault *vmf) { @@ -3553,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf) start_pgoff -= off; /* - * end_pgoff is either end of page table or end of vma - * or fault_around_pages() from start_pgoff, depending what is nearest. + * end_pgoff is either the end of the page table, the end of + * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c52aa05b106c..9bbd6982d4e4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -1637,7 +1634,7 @@ repeat: goto failed_removal; cond_resched(); - lru_add_drain_all_cpuslocked(); + lru_add_drain_all(); drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ce44d3ff03d..d879f1d8a44a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1121,8 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - BUG_ON(!vma); - return alloc_huge_page_noerr(vma, address, 1); + return alloc_huge_page_vma(page_hstate(compound_head(page)), + vma, address); } else if (thp_migration_supported() && PageTransHuge(page)) { struct page *thp; @@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; + unsigned long t; unsigned long nlongs; unsigned long endmask; @@ -1279,13 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero. + * + * If maxnode have more longs than MAX_NUMNODES, check + * the bits in that area first. And then go through to + * check the rest bits which equal or bigger than MAX_NUMNODES. + * Otherwise, just check bits [MAX_NUMNODES, maxnode). + */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { @@ -1298,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, endmask = ~0UL; } + if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { + unsigned long valid_mask = endmask; + + valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + if (get_user(t, nmask + nlongs - 1)) + return -EFAULT; + if (t & valid_mask) + return -EINVAL; + } + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; @@ -1418,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_MEMORY])) { - err = -EINVAL; + task_nodes = cpuset_mems_allowed(current); + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) + goto out_put; + + nodes_and(*new, *new, node_states[N_MEMORY]); + if (nodes_empty(*new)) goto out_put; - } err = security_task_movememory(task); if (err) diff --git a/mm/migrate.c b/mm/migrate.c index 4d0be47a322a..1e5525a25691 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1323,9 +1323,8 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - hugetlb_cgroup_migrate(hpage, new_hpage); + move_hugetlb_state(hpage, new_hpage, reason); put_new_page = NULL; - set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 96edb33fd09a..eff6b88a993f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); +/* + * Must be called while holding mm->mmap_sem for either read or write. + * The result is guaranteed to be valid until mm->mmap_sem is dropped. + */ +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + int id; + bool ret = false; + + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + + if (!mm_has_notifiers(mm)) + return ret; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (!mn->ops->invalidate_range && + !mn->ops->invalidate_range_start && + !mn->ops->invalidate_range_end) + continue; + + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { + ret = true; + break; + } + } + srcu_read_unlock(&srcu, id); + return ret; +} + static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mprotect.c b/mm/mprotect.c index 58b629bb70de..e3309fcf586b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page || PageKsm(page)) continue; + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + page_mapcount(page) != 1) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/nommu.c b/mm/nommu.c index 17c00d93de2e..4b9864b17cb0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, - int even_cows) -{ -} -EXPORT_SYMBOL(unmap_mapping_range); - int filemap_fault(struct vm_fault *vmf) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 29f855551efe..f2e7dfb81eee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * If the mm has notifiers then we would need to invalidate them around - * unmap_page_range and that is risky because notifiers can sleep and - * what they do is basically undeterministic. So let's have a short + * If the mm has invalidate_{start,end}() notifiers that could block, * sleep to give the oom victim some more time. * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time and add - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + * notifiers cannot block for unbounded amount of time */ - if (mm_has_notifiers(mm)) { + if (mm_has_blockable_invalidate_notifiers(mm)) { up_read(&mm->mmap_sem); schedule_timeout_idle(HZ); goto unlock_oom; @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - NULL); - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); } } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76c9688b6a0a..c7dd9c86e353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* - * Determine how many pages need to be initialized durig early boot + * Determine how many pages need to be initialized during early boot * (non-deferred initialization). * The value of first_deferred_pfn will be set later, once non-deferred pages * are initialized, but for now set it ULONG_MAX. @@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - /* Always populate low zones for address-contrained allocations */ + /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; @@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid) + unsigned long zone, int nid, bool zero) { - mm_zero_struct_page(page); + if (zero) + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, } static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid) + int nid, bool zero) { - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid); + __init_single_pfn(pfn, zid, nid, true); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Helper for deferred_init_range, free the given range, reset the counters, and - * return number of pages freed. + * Returns true if page needs to be initialized or freed to buddy allocator. + * + * First we check if pfn is valid on architectures where it is possible to have + * holes within pageblock_nr_pages. On systems where it is not possible, this + * function is optimized out. + * + * Then, we check if a current large page is valid by only checking the validity + * of the head pfn. + * + * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not belong + * to this memory node. */ -static inline unsigned long __init __def_free(unsigned long *nr_free, - unsigned long *free_base_pfn, - struct page **page) +static inline bool __init +deferred_pfn_valid(int nid, unsigned long pfn, + struct mminit_pfnnid_cache *nid_init_state) { - unsigned long nr = *nr_free; + if (!pfn_valid_within(pfn)) + return false; + if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + return false; + if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) + return false; + return true; +} - deferred_free_range(*free_base_pfn, nr); - *free_base_pfn = 0; - *nr_free = 0; - *page = NULL; +/* + * Free pages to buddy allocator. Try to free aligned pages in + * pageblock_nr_pages sizes. + */ +static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long nr_free = 0; - return nr; + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 0; + } else if (!(pfn & nr_pgmask)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 1; + cond_resched(); + } else { + nr_free++; + } + } + /* Free the last block of pages to allocator */ + deferred_free_range(pfn - nr_free, nr_free); } -static unsigned long __init deferred_init_range(int nid, int zid, - unsigned long start_pfn, - unsigned long end_pfn) +/* + * Initialize struct pages. We minimize pfn page lookups and scheduler checks + * by performing it only once every pageblock_nr_pages. + * Return number of pages initialized. + */ +static unsigned long __init deferred_init_pages(int nid, int zid, + unsigned long pfn, + unsigned long end_pfn) { struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; - unsigned long free_base_pfn = 0; unsigned long nr_pages = 0; - unsigned long nr_free = 0; struct page *page = NULL; - unsigned long pfn; - /* - * First we check if pfn is valid on architectures where it is possible - * to have holes within pageblock_nr_pages. On systems where it is not - * possible, this function is optimized out. - * - * Then, we check if a current large page is valid by only checking the - * validity of the head pfn. - * - * meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not - * belong to this memory node. - * - * Finally, we minimize pfn page lookups and scheduler checks by - * performing it only once every pageblock_nr_pages. - * - * We do it in two loops: first we initialize struct page, than free to - * buddy allocator, becuse while we are freeing pages we can access - * pages that are ahead (computing buddy page in __free_one_page()). - */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + page = NULL; continue; - if ((pfn & nr_pgmask) || pfn_valid(pfn)) { - if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - if (page && (pfn & nr_pgmask)) - page++; - else - page = pfn_to_page(pfn); - __init_single_page(page, pfn, zid, nid); - cond_resched(); - } - } - } - - page = NULL; - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (page && (pfn & nr_pgmask)) { - page++; - nr_free++; - } else { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - free_base_pfn = pfn; - nr_free = 1; cond_resched(); + } else { + page++; } + __init_single_page(page, pfn, zid, nid, true); + nr_pages++; } - /* Free the last block of pages to allocator */ - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - - return nr_pages; + return (nr_pages); } /* Initialise remaining memory on a node */ @@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data) } first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + /* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, than free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + */ + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + } for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_range(nid, zid, spfn, epfn); + deferred_free_pages(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ @@ -3391,7 +3398,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_THISNODE) goto out; - /* Exhausted what can be done so it's blamo time */ + /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; @@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) struct page *page; /* - * __get_free_pages() returns a 32-bit address, which cannot represent + * __get_free_pages() returns a virtual address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); @@ -5393,15 +5400,20 @@ not_early: * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone. + * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone, nid); + __init_single_page(page, pfn, zone, nid, + context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } else { - __init_single_pfn(pfn, zone, nid); + __init_single_pfn(pfn, zone, nid, + context != MEMMAP_HOTPLUG); } } } diff --git a/mm/page_ext.c b/mm/page_ext.c index 2c16216c29b6..5295ef331165 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,7 +59,9 @@ */ static struct page_ext_operations *page_ext_ops[] = { +#ifdef CONFIG_DEBUG_PAGEALLOC &debug_guardpage_ops, +#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_owner.c b/mm/page_owner.c index 270a8219ccd0..9886c6073828 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { + unsigned long block_end_pfn; + if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; @@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); - for (; pfn < block_end_pfn; pfn++) { + struct page *page; + struct page_ext *page_ext; + if (!pfn_valid_within(pfn)) continue; @@ -635,9 +633,7 @@ static int __init pageowner_init(void) dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, NULL, &proc_page_owner_operations); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; + return PTR_ERR_OR_ZERO(dentry); } late_initcall(pageowner_init) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1e4ee763c190..cf2af04b34b9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return old; } #endif diff --git a/mm/shmem.c b/mm/shmem.c index 7fbe67be86fa..1907688b75ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2717,15 +2717,28 @@ continue_resched: return error; } +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (file->f_op == &shmem_file_operations) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (file->f_op == &hugetlbfs_file_operations) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE) -int shmem_add_seals(struct file *file, unsigned int seals) +static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); + unsigned int *file_seals; int error; /* @@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals) * other file types. */ - if (file->f_op != &shmem_file_operations) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) @@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals) inode_lock(inode); - if (info->seals & F_SEAL_SEAL) { + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } - if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; @@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals) } } - info->seals |= seals; + *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } -EXPORT_SYMBOL_GPL(shmem_add_seals); -int shmem_get_seals(struct file *file) +static int memfd_get_seals(struct file *file) { - if (file->f_op != &shmem_file_operations) - return -EINVAL; + unsigned int *seals = memfd_file_seals_ptr(file); - return SHMEM_I(file_inode(file))->seals; + return seals ? *seals : -EINVAL; } -EXPORT_SYMBOL_GPL(shmem_get_seals); -long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { long error; @@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (arg > UINT_MAX) return -EINVAL; - error = shmem_add_seals(file, arg); + error = memfd_add_seals(file, arg); break; case F_GET_SEALS: - error = shmem_get_seals(file); + error = memfd_get_seals(file); break; default: error = -EINVAL; @@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct shmem_inode_info *info; + unsigned int *file_seals; struct file *file; int fd, error; char *name; @@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create, if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { - /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ - if (flags & MFD_ALLOW_SEALING) - return -EINVAL; /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) @@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create, file->f_flags |= O_RDWR | O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { - /* - * flags check at beginning of function ensures - * this is not a hugetlbfs (MFD_HUGETLB) file. - */ - info = SHMEM_I(file_inode(file)); - info->seals &= ~F_SEAL_SEAL; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); diff --git a/mm/slab.c b/mm/slab.c index 4e51ef954026..226906294183 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - slab_state = UP; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1353,8 +1351,6 @@ static int __init cpucache_init(void) slab_online_cpu, slab_offline_cpu); WARN_ON(ret < 0); - /* Done! */ - slab_state = FULL; return 0; } __initcall(cpucache_init); diff --git a/mm/slab.h b/mm/slab.h index ad657ffa44e5..e8e2095a6185 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size); - #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index c8cb36774ba1..deeddf95cdcf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -268,6 +268,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign; + + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +/* * Find a mergeable slab cache */ int slab_unmergeable(struct kmem_cache *s) @@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, return NULL; } -/* - * Figure out what the alignment of the objects will be given a set of - * flags, a user specified alignment and the size of the objects. - */ -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, slab_flags_t flags, void (*ctor)(void *), diff --git a/mm/slub.c b/mm/slub.c index cfd56e5a35fb..693b7074bc53 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) u8 *start; u8 *fault; u8 *end; + u8 *pad; int length; int remainder; @@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(pad, POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; @@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section(KERN_ERR, "Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", pad, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); return 0; } @@ -2220,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s, /* * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. This is done without interrupts disabled and without - * preemption disabled. The cmpxchg is racy and may put the partial page - * onto a random cpus partial slot. + * slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. diff --git a/mm/sparse.c b/mm/sparse.c index 2609aba121e8..6b8b5e91ceef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, */ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) { - return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + unsigned long coded_mem_map = + (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT)); + BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); + return coded_mem_map; } /* diff --git a/mm/swap.c b/mm/swap.c index 38e1b6374a97..10568b1548d4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page) } /** - * lru_cache_add: add a page to the page lists + * lru_cache_add_anon - add a page to the page lists * @page: the page to add */ void lru_cache_add_anon(struct page *page) @@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all_cpuslocked(void) +/* + * Doesn't need any cpu hotplug locking because we do rely on per-cpu + * kworkers being shut down before our page_alloc_cpu_dead callback is + * executed on the offlined cpu. + * Calling this function with cpu hotplug locks held can actually lead + * to obscure indirect dependencies via WQ context. + */ +void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void) mutex_unlock(&lock); } -void lru_add_drain_all(void) -{ - get_online_cpus(); - lru_add_drain_all_cpuslocked(); - put_online_cpus(); -} - /** * release_pages - batched put_page() * @pages: array of pages to release @@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add); */ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages, + pgoff_t start, unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->nr = find_get_entries(mapping, start, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } @@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * @mapping: The address_space to search * @start: The starting page index * @end: The final page index - * @nr_pages: The maximum number of pages * - * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE * pages in the mapping starting from index @start and upto index @end * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. @@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * also update @start to index the next page for the traversal. * * pagevec_lookup_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * number is smaller than PAGEVEC_SIZE, the end of specified range has been * reached. */ unsigned pagevec_lookup_range(struct pagevec *pvec, diff --git a/mm/truncate.c b/mm/truncate.c index e4b4cf0f4070..c34e2fd4f583 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -179,12 +179,8 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - loff_t holelen; - - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); + pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; + unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) @@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, /* * Zap the rest of the file in one hit. */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - (loff_t)(1 + end - index) - << PAGE_SHIFT, - 0); + unmap_mapping_pages(mapping, index, + (1 + end - index), false); did_range_unmap = 1; } else { /* * Just zap this page */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, + 1, false); } } BUG_ON(page_mapped(page)); @@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * get remapped later. */ if (dax_mapping(mapping)) { - unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, - (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + unmap_mapping_pages(mapping, start, end - start + 1, false); } out: cleancache_invalidate_inode(mapping); diff --git a/mm/vmscan.c b/mm/vmscan.c index 47d5ced51f2d..fdd3fc6be862 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone) return nr; } -unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) -{ - unsigned long nr; - - nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); - - if (get_nr_swap_pages() > 0) - nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); - - return nr; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) + struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; @@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_scanned) / shrinker->seeks; - delta *= freeable; - do_div(delta, nr_eligible + 1); + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_scanned, nr_eligible, - freeable, delta, total_scan); + freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one @@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target - * @nr_scanned: pressure numerator - * @nr_eligible: pressure denominator + * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * @@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * objects from the memory cgroup specified. Otherwise, only unaware * shrinkers are called. * - * @nr_scanned and @nr_eligible form a ratio that indicate how much of - * the available objects should be scanned. Page reclaim for example - * passes the number of pages scanned and the number of pages on the - * LRU lists that it considered on @nid, plus a bias in @nr_scanned - * when it encountered mapped pages. The ratio is further biased by - * the ->seeks setting of the shrink function, which indicates the - * cost to recreate an object relative to that of an LRU page. + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, - unsigned long nr_scanned, - unsigned long nr_eligible) + int priority) { struct shrinker *shrinker; unsigned long freed = 0; @@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (nr_scanned == 0) - nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we @@ -501,7 +472,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, priority); + /* + * Bail out if someone want to register a new shrinker to + * prevent the regsitration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } up_read(&shrinker_rwsem); @@ -519,8 +499,7 @@ void drop_slab_node(int nid) freed = 0; do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, - 1000, 1000); + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while (freed > 10); } @@ -1436,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; + bool migrate_dirty; /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate - * without blocking + * without blocking. However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. */ + if (!trylock_page(page)) + return ret; + mapping = page_mapping(page); - if (mapping && !mapping->a_ops->migratepage) + migrate_dirty = mapping && mapping->a_ops->migratepage; + unlock_page(page); + if (!migrate_dirty) return ret; } } @@ -2615,14 +2604,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; if (memcg) shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->nr_scanned - scanned, - lru_pages); + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2646,14 +2633,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - /* - * Shrink the slab caches in the same proportion that - * the eligible LRU pages were scanned. - */ if (global_reclaim(sc)) shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->nr_scanned - nr_scanned, - node_lru_pages); + sc->priority); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; diff --git a/mm/zpool.c b/mm/zpool.c index fd3ff719c32c..e1e7aa6d1d06 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,6 +21,7 @@ struct zpool { struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; + bool evictable; struct list_head list; }; @@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be shrinkable. + * ops param is NULL, then the created zpool will not be evictable. * * Implementations must guarantee this to be thread-safe. * @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle) int zpool_shrink(struct zpool *zpool, unsigned int pages, unsigned int *reclaimed) { - return zpool->driver->shrink(zpool->pool, pages, reclaimed); + return zpool->driver->shrink ? + zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; } /** @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } +/** + * zpool_evictable() - Test if zpool is potentially evictable + * @pool The zpool to test + * + * Zpool is only potentially evictable when it's created with struct + * zpool_ops.evict and its driver implements struct zpool_driver.shrink. + * + * However, it doesn't necessarily mean driver will use zpool_ops.evict + * in its implementation of zpool_driver.shrink. It could do internal + * defragmentation instead. + * + * Returns: true if potentially evictable; false otherwise. + */ +bool zpool_evictable(struct zpool *zpool) +{ + return zpool->evictable; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 683c0651098c..c3013505c305 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -46,6 +46,7 @@ #include <linux/vmalloc.h> #include <linux/preempt.h> #include <linux/spinlock.h> +#include <linux/shrinker.h> #include <linux/types.h> #include <linux/debugfs.h> #include <linux/zsmalloc.h> @@ -257,11 +258,7 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; - /* - * To signify that register_shrinker() was successful - * and unregister_shrinker() will not Oops. - */ - bool shrinker_enabled; + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -407,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static int zs_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - return -EINVAL; -} - static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -450,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = { .destroy = zs_zpool_destroy, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -1057,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ - link->next = -1 << OBJ_TAG_BITS; + link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; @@ -2324,10 +2314,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - if (pool->shrinker_enabled) { - unregister_shrinker(&pool->shrinker); - pool->shrinker_enabled = false; - } + unregister_shrinker(&pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) @@ -2426,11 +2413,13 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Not critical, we still can use the pool - * and user can trigger compaction manually. + * Not critical since shrinker is only used to trigger internal + * defragmentation of the pool which is pretty optional thing. If + * registration fails we still can use the pool normally and user can + * trigger compaction manually. Thus, ignore return code. */ - if (zs_register_shrinker(pool) == 0) - pool->shrinker_enabled = true; + zs_register_shrinker(pool); + return pool; err: diff --git a/mm/zswap.c b/mm/zswap.c index d39581a076c3..c004aa4fd3f4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -49,6 +49,8 @@ static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +/* The number of same-value filled pages currently stored in zswap */ +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* Enable/disable handling same-value filled pages (enabled by default) */ +static bool zswap_same_filled_pages_enabled = true; +module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -145,9 +152,10 @@ struct zswap_pool { * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression. For a same value filled page length is 0. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data + * value - value of the same-value filled pages which have same content */ struct zswap_entry { struct rb_node rbnode; @@ -155,7 +163,10 @@ struct zswap_entry { int refcount; unsigned int length; struct zswap_pool *pool; - unsigned long handle; + union { + unsigned long handle; + unsigned long value; + }; }; struct zswap_header { @@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); @@ -953,6 +968,28 @@ static int zswap_shrink(void) return ret; } +static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos] != page[0]) + return 0; + } + *value = page[0]; + return 1; +} + +static void zswap_fill_page(void *ptr, unsigned long value) +{ + unsigned long *page; + + page = (unsigned long *)ptr; + memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +} + /********************************* * frontswap hooks **********************************/ @@ -964,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_entry *entry, *dupentry; struct crypto_comp *tfm; int ret; - unsigned int dlen = PAGE_SIZE, len; - unsigned long handle; + unsigned int hlen, dlen = PAGE_SIZE; + unsigned long handle, value; char *buf; u8 *src, *dst; - struct zswap_header *zhdr; + struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; if (!zswap_enabled || !tree) { ret = -ENODEV; @@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + if (zswap_same_filled_pages_enabled) { + src = kmap_atomic(page); + if (zswap_is_page_same_filled(src, &value)) { + kunmap_atomic(src); + entry->offset = offset; + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto insert_entry; + } + kunmap_atomic(src); + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { @@ -1013,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ - len = dlen + sizeof(struct zswap_header); - ret = zpool_malloc(entry->pool->zpool, len, + hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, &handle); if (ret == -ENOSPC) { @@ -1025,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); - zhdr->swpentry = swp_entry(type, offset); - buf = (u8 *)(zhdr + 1); - memcpy(buf, dst, dlen); + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); put_cpu_var(zswap_dstmem); @@ -1037,6 +1086,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry->handle = handle; entry->length = dlen; +insert_entry: /* map */ spin_lock(&tree->lock); do { @@ -1089,10 +1139,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } spin_unlock(&tree->lock); + if (!entry->length) { + dst = kmap_atomic(page); + zswap_fill_page(dst, entry->value); + kunmap_atomic(dst); + goto freeentry; + } + /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); @@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); +freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); @@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_atomic_t("same_filled_pages", 0444, + zswap_debugfs_root, &zswap_same_filled_pages); return 0; } diff --git a/scripts/decodecode b/scripts/decodecode index 5ea071099330..9cef558528aa 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -21,12 +21,24 @@ trap cleanup EXIT T=`mktemp` || die "cannot create temp file" code= +cont= while read i ; do case "$i" in *Code:*) code=$i + cont=yes + ;; +*) + [ -n "$cont" ] && { + xdump="$(echo $i | grep '^[[:xdigit:]<>[:space:]]\+$')" + if [ -n "$xdump" ]; then + code="$code $xdump" + else + cont= + fi + } ;; esac diff --git a/scripts/tags.sh b/scripts/tags.sh index d23dcbf17457..78e546ff689c 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -77,7 +77,7 @@ find_include_sources() find_other_sources() { find ${tree}* $ignore \ - \( -name include -o -name arch -o -name '.tmp_*' \) -prune -o \ + \( -path ${tree}include -o -path ${tree}arch -o -name '.tmp_*' \) -prune -o \ -name "$1" -not -type l -print; } diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 3926a0409dda..a5276a91dfbf 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -12,3 +12,8 @@ fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) include ../lib.mk $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs) + +$(OUTPUT)/memfd_test: memfd_test.c common.o +$(OUTPUT)/fuse_test: fuse_test.c common.o + +EXTRA_CLEAN = common.o diff --git a/tools/testing/selftests/memfd/common.c b/tools/testing/selftests/memfd/common.c new file mode 100644 index 000000000000..8eb3d75f6e60 --- /dev/null +++ b/tools/testing/selftests/memfd/common.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <stdio.h> +#include <stdlib.h> +#include <linux/fcntl.h> +#include <linux/memfd.h> +#include <unistd.h> +#include <sys/syscall.h> + +#include "common.h" + +int hugetlbfs_test = 0; + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int sys_memfd_create(const char *name, unsigned int flags) +{ + if (hugetlbfs_test) + flags |= MFD_HUGETLB; + + return syscall(__NR_memfd_create, name, flags); +} diff --git a/tools/testing/selftests/memfd/common.h b/tools/testing/selftests/memfd/common.h new file mode 100644 index 000000000000..522d2c630bd8 --- /dev/null +++ b/tools/testing/selftests/memfd/common.h @@ -0,0 +1,9 @@ +#ifndef COMMON_H_ +#define COMMON_H_ + +extern int hugetlbfs_test; + +unsigned long default_huge_page_size(void); +int sys_memfd_create(const char *name, unsigned int flags); + +#endif diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 1ccb7a3eb14b..b018e835737d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -33,14 +33,12 @@ #include <sys/wait.h> #include <unistd.h> +#include "common.h" + #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - return syscall(__NR_memfd_create, name, flags); -} +static size_t mfd_def_size = MFD_DEF_SIZE; static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { @@ -127,7 +125,7 @@ static void *mfd_assert_mmap_shared(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, @@ -145,7 +143,7 @@ static void *mfd_assert_mmap_private(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, @@ -178,7 +176,7 @@ static int sealing_thread_fn(void *arg) usleep(200000); /* unmount mapping before sealing to avoid i_mmap_writable failures */ - munmap(global_p, MFD_DEF_SIZE); + munmap(global_p, mfd_def_size); /* Try sealing the global file; expect EBUSY or success. Current * kernels will never succeed, but in the future, kernels might @@ -228,7 +226,7 @@ static void join_sealing_thread(pid_t pid) int main(int argc, char **argv) { - static const char zero[MFD_DEF_SIZE]; + char *zero; int fd, mfd, r; void *p; int was_sealed; @@ -239,6 +237,25 @@ int main(int argc, char **argv) abort(); } + if (argc >= 3) { + if (!strcmp(argv[2], "hugetlbfs")) { + unsigned long hpage_size = default_huge_page_size(); + + if (!hpage_size) { + printf("Unable to determine huge page size\n"); + abort(); + } + + hugetlbfs_test = 1; + mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[2]); + abort(); + } + } + + zero = calloc(sizeof(*zero), mfd_def_size); + /* open FUSE memfd file for GUP testing */ printf("opening: %s\n", argv[1]); fd = open(argv[1], O_RDONLY | O_CLOEXEC); @@ -249,7 +266,7 @@ int main(int argc, char **argv) /* create new memfd-object */ mfd = mfd_assert_new("kern_memfd_fuse", - MFD_DEF_SIZE, + mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); /* mmap memfd-object for writing */ @@ -268,7 +285,7 @@ int main(int argc, char **argv) * This guarantees that the receive-buffer is pinned for 1s until the * data is written into it. The racing ADD_SEALS should thus fail as * the pages are still pinned. */ - r = read(fd, p, MFD_DEF_SIZE); + r = read(fd, p, mfd_def_size); if (r < 0) { printf("read() failed: %m\n"); abort(); @@ -295,10 +312,10 @@ int main(int argc, char **argv) * enough to avoid any in-flight writes. */ p = mfd_assert_mmap_private(mfd); - if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { + if (was_sealed && memcmp(p, zero, mfd_def_size)) { printf("memfd sealed during read() but data not discarded\n"); abort(); - } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { + } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { printf("memfd sealed after read() but data discarded\n"); abort(); } @@ -307,6 +324,7 @@ int main(int argc, char **argv) close(fd); printf("fuse: DONE\n"); + free(zero); return 0; } diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 132a54f74e88..10baa1652fc2 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -19,7 +19,10 @@ #include <sys/wait.h> #include <unistd.h> +#include "common.h" + #define MEMFD_STR "memfd:" +#define MEMFD_HUGE_STR "memfd-hugetlb:" #define SHARED_FT_STR "(shared file-table)" #define MFD_DEF_SIZE 8192 @@ -28,41 +31,8 @@ /* * Default is not to test hugetlbfs */ -static int hugetlbfs_test; static size_t mfd_def_size = MFD_DEF_SIZE; - -/* - * Copied from mlock2-tests.c - */ -static unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - if (hugetlbfs_test) - flags |= MFD_HUGETLB; - - return syscall(__NR_memfd_create, name, flags); -} +static const char *memfd_str = MEMFD_STR; static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { @@ -513,6 +483,10 @@ static void mfd_assert_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -533,6 +507,10 @@ static void mfd_fail_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -598,7 +576,7 @@ static void test_create(void) char buf[2048]; int fd; - printf("%s CREATE\n", MEMFD_STR); + printf("%s CREATE\n", memfd_str); /* test NULL name */ mfd_fail_new(NULL, 0); @@ -627,18 +605,13 @@ static void test_create(void) fd = mfd_assert_new("", 0, MFD_CLOEXEC); close(fd); - if (!hugetlbfs_test) { - /* verify MFD_ALLOW_SEALING is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); - close(fd); - - /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); - close(fd); - } else { - /* sealing is not supported on hugetlbfs */ - mfd_fail_new("", MFD_ALLOW_SEALING); - } + /* verify MFD_ALLOW_SEALING is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); + close(fd); + + /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); + close(fd); } /* @@ -649,11 +622,7 @@ static void test_basic(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s BASIC\n", MEMFD_STR); + printf("%s BASIC\n", memfd_str); fd = mfd_assert_new("kern_memfd_basic", mfd_def_size, @@ -698,28 +667,6 @@ static void test_basic(void) } /* - * hugetlbfs doesn't support seals or write, so just verify grow and shrink - * on a hugetlbfs file created via memfd_create. - */ -static void test_hugetlbfs_grow_shrink(void) -{ - int fd; - - printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR); - - fd = mfd_assert_new("kern_memfd_seal_write", - mfd_def_size, - MFD_CLOEXEC); - - mfd_assert_read(fd); - mfd_assert_write(fd); - mfd_assert_shrink(fd); - mfd_assert_grow(fd); - - close(fd); -} - -/* * Test SEAL_WRITE * Test whether SEAL_WRITE actually prevents modifications. */ @@ -727,14 +674,7 @@ static void test_seal_write(void) { int fd; - /* - * hugetlbfs does not contain sealing or write support. Just test - * basic grow and shrink via test_hugetlbfs_grow_shrink. - */ - if (hugetlbfs_test) - return test_hugetlbfs_grow_shrink(); - - printf("%s SEAL-WRITE\n", MEMFD_STR); + printf("%s SEAL-WRITE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_write", mfd_def_size, @@ -760,11 +700,7 @@ static void test_seal_shrink(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-SHRINK\n", MEMFD_STR); + printf("%s SEAL-SHRINK\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_shrink", mfd_def_size, @@ -790,11 +726,7 @@ static void test_seal_grow(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-GROW\n", MEMFD_STR); + printf("%s SEAL-GROW\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_grow", mfd_def_size, @@ -820,11 +752,7 @@ static void test_seal_resize(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-RESIZE\n", MEMFD_STR); + printf("%s SEAL-RESIZE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_resize", mfd_def_size, @@ -843,32 +771,6 @@ static void test_seal_resize(void) } /* - * hugetlbfs does not support seals. Basic test to dup the memfd created - * fd and perform some basic operations on it. - */ -static void hugetlbfs_dup(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_dup", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_dup(fd); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - -/* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. */ @@ -876,16 +778,7 @@ static void test_share_dup(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. Perform some - * basic testing on dup'ed fd instead via hugetlbfs_dup. - */ - if (hugetlbfs_test) { - hugetlbfs_dup(b_suffix); - return; - } - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_dup", mfd_def_size, @@ -927,11 +820,7 @@ static void test_share_mmap(char *banner, char *b_suffix) int fd; void *p; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_mmap", mfd_def_size, @@ -956,32 +845,6 @@ static void test_share_mmap(char *banner, char *b_suffix) } /* - * Basic test to make sure we can open the hugetlbfs fd via /proc and - * perform some simple operations on it. - */ -static void hugetlbfs_proc_open(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_open", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_open(fd, O_RDWR, 0); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - -/* * Test sealing with open(/proc/self/fd/%d) * Via /proc we can get access to a separate file-context for the same memfd. * This is *not* like dup(), but like a real separate open(). Make sure the @@ -991,16 +854,7 @@ static void test_share_open(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. So test basic - * functionality of using /proc fd via hugetlbfs_proc_open - */ - if (hugetlbfs_test) { - hugetlbfs_proc_open(b_suffix); - return; - } - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_open", mfd_def_size, @@ -1043,11 +897,7 @@ static void test_share_fork(char *banner, char *b_suffix) int fd; pid_t pid; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_fork", mfd_def_size, @@ -1083,7 +933,11 @@ int main(int argc, char **argv) } hugetlbfs_test = 1; + memfd_str = MEMFD_HUGE_STR; mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[1]); + abort(); } } diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh index 407df68dfe27..22e572e2d66a 100755 --- a/tools/testing/selftests/memfd/run_fuse_test.sh +++ b/tools/testing/selftests/memfd/run_fuse_test.sh @@ -10,6 +10,6 @@ set -e mkdir mnt ./fuse_mnt ./mnt -./fuse_test ./mnt/memfd +./fuse_test ./mnt/memfd $@ fusermount -u ./mnt rmdir ./mnt diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh index daabb350697c..c2d41ed81b24 100755 --- a/tools/testing/selftests/memfd/run_tests.sh +++ b/tools/testing/selftests/memfd/run_tests.sh @@ -60,6 +60,7 @@ fi # Run the hugetlbfs test # ./memfd_test hugetlbfs +./run_fuse_test.sh hugetlbfs # # Give back any huge pages allocated for the test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7f45806bd863..fdefa2295ddc 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -8,17 +8,18 @@ endif CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) LDLIBS = -lrt TEST_GEN_FILES = compaction_test +TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += gup_benchmark TEST_PROGS := run_vmtests diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index cc826326de87..d2561895a021 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -177,4 +177,15 @@ else echo "[PASS]" fi +echo "-----------------------------" +echo "running virtual address 128TB switch test" +echo "-----------------------------" +./va_128TBswitch +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c new file mode 100644 index 000000000000..e7fe734c374f --- /dev/null +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -0,0 +1,297 @@ +/* + * + * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> + * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <stdio.h> +#include <sys/mman.h> +#include <string.h> + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. + */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = 0; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = 1; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = 1; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return 0; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/x86/5lvl.c b/tools/testing/selftests/x86/5lvl.c deleted file mode 100644 index 2eafdcd4c2b3..000000000000 --- a/tools/testing/selftests/x86/5lvl.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <stdio.h> -#include <sys/mman.h> - -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) - -#define PAGE_SIZE 4096 -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 50)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE / 2)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - PAGE_SIZE, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 4UL << 20, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - (2UL << 20)), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - (2UL << 20), 4UL << 20, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -int main(int argc, char **argv) -{ - int i; - void *p; - - for (i = 0; i < ARRAY_SIZE(testcases); i++) { - struct testcase *t = testcases + i; - - p = mmap(t->addr, t->size, PROT_NONE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - continue; - } - - if (t->low_addr_required && p >= (void *)(1UL << 47)) - printf("FAILED\n"); - else - printf("OK\n"); - if (!t->keep_mapped) - munmap(p, t->size); - } - return 0; -} diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index e92903fc7113..a8783f48f77f 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -169,9 +169,10 @@ static int opt_raw; /* for kernel developers */ static int opt_list; /* list pages (in ranges) */ static int opt_no_summary; /* don't show summary */ static pid_t opt_pid; /* process to walk */ -const char * opt_file; /* file or directory path */ +const char *opt_file; /* file or directory path */ static uint64_t opt_cgroup; /* cgroup inode */ static int opt_list_cgroup;/* list page cgroup */ +static const char *opt_kpageflags;/* kpageflags file to parse */ #define MAX_ADDR_RANGES 1024 static int nr_addr_ranges; @@ -258,7 +259,7 @@ static int checked_open(const char *pathname, int flags) * pagemap/kpageflags routines */ -static unsigned long do_u64_read(int fd, char *name, +static unsigned long do_u64_read(int fd, const char *name, uint64_t *buf, unsigned long index, unsigned long count) @@ -283,7 +284,7 @@ static unsigned long kpageflags_read(uint64_t *buf, unsigned long index, unsigned long pages) { - return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages); + return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); } static unsigned long kpagecgroup_read(uint64_t *buf, @@ -293,7 +294,7 @@ static unsigned long kpagecgroup_read(uint64_t *buf, if (kpagecgroup_fd < 0) return pages; - return do_u64_read(kpagecgroup_fd, PROC_KPAGEFLAGS, buf, index, pages); + return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); } static unsigned long pagemap_read(uint64_t *buf, @@ -743,7 +744,7 @@ static void walk_addr_ranges(void) { int i; - kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); if (!nr_addr_ranges) add_addr_range(0, ULONG_MAX); @@ -790,6 +791,7 @@ static void usage(void) " -N|--no-summary Don't show summary info\n" " -X|--hwpoison hwpoison pages\n" " -x|--unpoison unpoison pages\n" +" -F|--kpageflags filename kpageflags file to parse\n" " -h|--help Show this usage message\n" "flags:\n" " 0x10 bitfield format, e.g.\n" @@ -1013,7 +1015,7 @@ static void walk_page_cache(void) { struct stat st; - kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); sigaction(SIGBUS, &sigbus_action, NULL); @@ -1164,6 +1166,11 @@ static void parse_bits_mask(const char *optarg) add_bits_filter(mask, bits); } +static void parse_kpageflags(const char *name) +{ + opt_kpageflags = name; +} + static void describe_flags(const char *optarg) { uint64_t flags = parse_flag_names(optarg, 0); @@ -1188,6 +1195,7 @@ static const struct option opts[] = { { "no-summary", 0, NULL, 'N' }, { "hwpoison" , 0, NULL, 'X' }, { "unpoison" , 0, NULL, 'x' }, + { "kpageflags", 0, NULL, 'F' }, { "help" , 0, NULL, 'h' }, { NULL , 0, NULL, 0 } }; @@ -1199,7 +1207,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:ClLNXxh", opts, NULL)) != -1) { + "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -1242,6 +1250,9 @@ int main(int argc, char *argv[]) opt_unpoison = 1; prepare_hwpoison_fd(); break; + case 'F': + parse_kpageflags(optarg); + break; case 'h': usage(); exit(0); @@ -1251,6 +1262,9 @@ int main(int argc, char *argv[]) } } + if (!opt_kpageflags) + opt_kpageflags = PROC_KPAGEFLAGS; + if (opt_cgroup || opt_list_cgroup) kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d6b9370806f8..35db929f92f0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |