Diffstat (limited to 'mm')
 mm/Kconfig           |  27
 mm/bootmem.c         |   8
 mm/bounce.c          |   2
 mm/compaction.c      |  18
 mm/filemap.c         |  21
 mm/filemap_xip.c     |   2
 mm/fremap.c          |   8
 mm/huge_memory.c     | 438
 mm/hugetlb.c         | 178
 mm/hwpoison-inject.c |   5
 mm/kmemleak.c        |   4
 mm/ksm.c             |   4
 mm/list_lru.c        |   3
 mm/madvise.c         |   5
 mm/memblock.c        | 124
 mm/memcontrol.c      | 972
 mm/memory-failure.c  |  70
 mm/memory.c          | 217
 mm/memory_hotplug.c  |  65
 mm/mempolicy.c       | 165
 mm/migrate.c         | 183
 mm/mlock.c           |  53
 mm/mm_init.c         |  18
 mm/mmap.c            |  23
 mm/mmzone.c          |  14
 mm/mprotect.c        |  89
 mm/mremap.c          |   5
 mm/nobootmem.c       |  25
 mm/nommu.c           |   5
 mm/oom_kill.c        |   8
 mm/page-writeback.c  |  10
 mm/page_alloc.c      |  61
 mm/pagewalk.c        |   2
 mm/percpu.c          |   5
 mm/pgtable-generic.c |  24
 mm/readahead.c       |   8
 mm/rmap.c            |  19
 mm/shmem.c           |  36
 mm/slab.c            | 573
 mm/slab.h            |   6
 mm/slab_common.c     |   4
 mm/slub.c            |  49
 mm/sparse.c          |  53
 mm/swap.c            | 146
 mm/swapfile.c        |  20
 mm/util.c            |  13
 mm/vmalloc.c         |  48
 mm/vmscan.c          |  88
 mm/vmstat.c          |  22
 mm/zswap.c           | 199
 50 files changed, 2430 insertions(+), 1715 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 026771a9b097..723bbe04a0b0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -20,7 +20,7 @@ config FLATMEM_MANUAL Some users of more advanced features like NUMA and memory hotplug may have different options here. - DISCONTIGMEM is an more mature, better tested system, + DISCONTIGMEM is a more mature, better tested system, but is incompatible with memory hotplug and may suffer decreased performance over SPARSEMEM. If unsure between "Sparse Memory" and "Discontiguous Memory", choose @@ -153,11 +153,18 @@ config MOVABLE_NODE help Allow a node to have only movable memory. Pages used by the kernel, such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows users to - online all the memory of a node as movable memory so that the whole - node can be hotplugged. Users who don't use the memory hotplug - feature are fine with this option on since they don't online memory - as movable. + memory device cannot be hotplugged. This option allows the following + two things: + - When the system is booting, node full of hotpluggable memory can + be arranged to have only movable memory so that the whole node can + be hot-removed. (need movable_node boot option specified). + - After the system is up, the option allows users to online all the + memory of a node as movable memory so that the whole node can be + hot-removed. + + Users who don't use the memory hotplug feature are fine with this + option on since they don't specify movable_node boot option or they + don't online memory as movable. Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. @@ -183,7 +190,7 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select MEMORY_ISOLATION - select HAVE_BOOTMEM_INFO_NODE if X86_64 + select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION @@ -211,9 +218,11 @@ config SPLIT_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 - default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + # # support for memory balloon compaction config BALLOON_COMPACTION @@ -534,7 +543,7 @@ config ZSWAP config MEM_SOFT_DIRTY bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS select PROC_PAGE_MONITOR help This option enables memory changes tracking by introducing a diff --git a/mm/bootmem.c b/mm/bootmem.c index 6ab7744e692e..90bd3507b413 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long start, end, pages, count = 0; + unsigned long *map, start, end, pages, count = 0; if (!bdata->node_bootmem_map) return 0; + map = bdata->node_bootmem_map; start = bdata->node_min_pfn; end = bdata->node_low_pfn; @@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) bdata - bootmem_node_data, start, end); while (start < end) { - unsigned long *map, idx, vec; + unsigned long idx, vec; unsigned shift; - map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; shift = idx & (BITS_PER_LONG - 1); /* @@ -784,7 +784,7 @@ void * __init 
__alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end_pfn = pgdat_end_pfn(pgdat); if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { diff --git a/mm/bounce.c b/mm/bounce.c index c9f0a4339a7d..5a7d58fb883b 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, struct bio_vec *to, *from; unsigned i; + if (force) + goto bounce; bio_for_each_segment(from, *bio_orig, i) if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) goto bounce; diff --git a/mm/compaction.c b/mm/compaction.c index c43789388cd8..f58bcd016f43 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc, bool migrate_scanner) { struct zone *zone = cc->zone; + + if (cc->ignore_skip_hint) + return; + if (!page) return; @@ -235,10 +239,9 @@ static bool suitable_migration_target(struct page *page) } /* - * Isolate free pages onto a private freelist. Caller must hold zone->lock. - * If @strict is true, will abort returning 0 on any invalid PFNs or non-free - * pages inside of the pageblock (even though it may still end up isolating - * some pages). + * Isolate free pages onto a private freelist. If @strict is true, will abort + * returning 0 on any invalid PFNs or non-free pages inside of the pageblock + * (even though it may still end up isolating some pages). */ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long blockpfn, @@ -677,6 +680,13 @@ static void isolate_freepages(struct zone *zone, pfn -= pageblock_nr_pages) { unsigned long isolated; + /* + * This can iterate a massively long zone without finding any + * suitable migration targets, so periodically check if we need + * to schedule. + */ + cond_resched(); + if (!pfn_valid(pfn)) continue; diff --git a/mm/filemap.c b/mm/filemap.c index 1e6aec4a2d2e..b7749a92021c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp, * @filp: the file to read * @ppos: current file position * @desc: read_descriptor - * @actor: read method * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp, * of the logic when it comes to error handling etc. */ static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc, read_actor_t actor) + read_descriptor_t *desc) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1200,13 +1199,14 @@ page_ok: * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * - * The actor routine returns how many bytes were actually used.. + * The file_read_actor routine returns how many bytes were + * actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). 
*/ - ret = actor(desc, page, offset, nr); + ret = file_read_actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; @@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (desc.count == 0) continue; desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); + do_generic_file_read(filp, ppos, &desc); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; @@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - bool memcg_oom; pgoff_t size; int ret = 0; @@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? Either - * way, try readahead, but disable the memcg OOM killer for it - * as readahead is optional and no errors are propagated up - * the fault stack. The OOM killer is enabled while trying to - * instantiate the faulting page individually below. + * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); - mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ - memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); - mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 28fe26b64f8a..d8d9fe3f685c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -26,7 +26,7 @@ * of ZERO_PAGE(), such as /dev/zero */ static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO; +static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); static struct page *__xip_sparse_page; /* called under xip_sparse_mutex */ diff --git a/mm/fremap.c b/mm/fremap.c index 5bff08147768..bbc4d660221a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -208,9 +208,10 @@ get_write_lock: if (mapping_cap_account_dirty(mapping)) { unsigned long addr; struct file *file = get_file(vma->vm_file); + /* mmap_region may free vma; grab the info now */ + vm_flags = vma->vm_flags; - addr = mmap_region(file, start, size, - vma->vm_flags, pgoff); + addr = mmap_region(file, start, size, vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -218,7 +219,7 @@ get_write_lock: BUG_ON(addr != start); err = 0; } - goto out; + goto out_freed; } mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); @@ -253,6 +254,7 @@ get_write_lock: out: if (vma) vm_flags = vma->vm_flags; +out_freed: if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..9c0b17295ba0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -27,11 +27,12 @@ #include "internal.h" /* - * By default transparent hugepage support is enabled for all mappings - * and khugepaged scans all mappings. 
Defrag is only invoked by - * khugepaged hugepage allocations and by page faults inside - * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived - * allocations. + * By default transparent hugepage support is disabled in order that avoid + * to risk increase the memory footprint of applications without a guaranteed + * benefit. When transparent hugepage support is enabled, is for all mappings, + * and khugepaged scans all mappings. + * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. */ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS @@ -709,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; + spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -723,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -737,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; - spin_unlock(&mm->page_table_lock); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); } return 0; @@ -758,14 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } -#ifndef CONFIG_NUMA -static inline struct page *alloc_hugepage(int defrag) -{ - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); -} -#endif - +/* Caller must hold page table lock. 
*/ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -778,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); return true; } @@ -797,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -809,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -845,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -855,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -865,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * mm->page_table_lock is enough to be sure that huge zero pmd is not + * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. 
*/ @@ -884,10 +882,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; goto out_unlock; } + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -903,12 +902,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } @@ -919,10 +918,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { + spinlock_t *ptl; pmd_t entry; unsigned long haddr; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -932,13 +932,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -965,7 +966,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -992,7 +993,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -1002,7 +1003,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1016,6 +1017,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1062,7 +1064,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1088,7 +1090,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1099,7 +1101,7 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1114,17 +1116,19 @@ 
out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1140,7 +1144,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1187,11 +1191,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1213,13 +1217,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return ret; } @@ -1231,7 +1235,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1240,6 +1244,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto out; + page = pmd_page(*pmd); VM_BUG_ON(!PageHead(page)); if (flags & FOLL_TOUCH) { @@ -1278,73 +1286,133 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + spinlock_t *ptl; + struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; - int target_nid; - int current_nid = -1; - bool migrated; + int page_nid = -1, this_nid = numa_node_id(); + int target_nid, last_cpupid = -1; + bool page_locked; + bool migrated = false; + int flags = 0; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. 
+ */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + page = pmd_page(pmd); - get_page(page); - current_nid = page_to_nid(page); + BUG_ON(is_huge_zero_page(page)); + page_nid = page_to_nid(page); + last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == this_nid) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pmd_write(pmd)) + flags |= TNF_NO_GROUP; + + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; } - /* Acquire the page lock to serialise THP migrations */ - spin_unlock(&mm->page_table_lock); - lock_page(page); + /* Migration could have started since the pmd_trans_migrating check */ + if (!page_locked) { + spin_unlock(ptl); + wait_on_page_locked(page); + page_nid = -1; + goto out; + } - /* Confirm the PTE did not while locked */ - spin_lock(&mm->page_table_lock); + /* + * Page is misplaced. Page lock serialises migrations. Acquire anon_vma + * to serialises splits + */ + get_page(page); + spin_unlock(ptl); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PMD did not change while page_table_lock was released */ + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); + page_nid = -1; goto out_unlock; } - spin_unlock(&mm->page_table_lock); - /* Migrate the THP to the requested node */ + /* Bail if we fail to protect against THP splits for any reason */ + if (unlikely(!anon_vma)) { + put_page(page); + page_nid = -1; + goto clear_pmdnuma; + } + + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. 
+ */ + spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (!migrated) - goto check_same; - - task_numa_fault(target_nid, HPAGE_PMD_NR, true); - return 0; + if (migrated) { + flags |= TNF_MIGRATED; + page_nid = target_nid; + } -check_same: - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) - goto out_unlock; + goto out; clear_pmdnuma: + BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); out_unlock: - spin_unlock(&mm->page_table_lock); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, false); + spin_unlock(ptl); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + return 0; } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1358,8 +1426,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1367,8 +1435,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1381,14 +1449,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1401,6 +1470,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1421,41 +1491,81 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) { + pgtable_t pgtable; + + /* + * Move preallocated PTE page table if new_pmd is on + * different PMD page table. 
+ */ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + + spin_unlock(new_ptl); + } + spin_unlock(old_ptl); } out: return ret; } +/* + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchange and TLB flush unnecessary + * - HPAGE_PMD_NR is protections changed and TLB flush necessary + */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; - entry = pmdp_get_and_clear(mm, addr, pmd); + ret = 1; if (!prot_numa) { + entry = pmdp_get_and_clear(mm, addr, pmd); + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); - /* only check non-shared pages */ - if (page_mapcount(page) == 1 && + /* + * Do not trap faults against the zero page. The + * read-only data is likely to be read-cached on the + * local CPU cache and it is less useful to know about + * local vs remote hits on the zero page. + */ + if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { + entry = *pmd; entry = pmd_mknuma(entry); + ret = HPAGE_PMD_NR; } } - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); - ret = 1; + + /* Set PMD if cleared earlier */ + if (ret == HPAGE_PMD_NR) + set_pmd_at(mm, addr, pmd, entry); + + spin_unlock(ptl); } return ret; @@ -1468,12 +1578,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1482,27 +1593,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pmd = mm_find_pmd(mm, address); if (!pmd) - goto out; + return NULL; + *ptl = pmd_lock(mm, pmd); if (pmd_none(*pmd)) - goto out; + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. 
There is * no risk as long as all huge pmd are found and have their @@ -1512,14 +1633,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1527,6 +1649,7 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1534,9 +1657,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1547,8 +1669,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1636,7 +1758,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nid_xchg_last(page_tail, page_nid_last(page)); + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1679,14 +1801,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1741,8 +1863,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } @@ -2139,7 +2261,34 @@ static void khugepaged_alloc_sleep(void) msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } +static int khugepaged_node_load[MAX_NUMNODES]; + #ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { if (IS_ERR(*hpage)) { @@ -2173,9 +2322,8 @@ static struct page * mmap_sem in read mode is good 
idea also to allow greater * scalability. */ - *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. @@ -2191,6 +2339,17 @@ static struct page return *hpage; } #else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} + static struct page *khugepaged_alloc_hugepage(bool *wait) { struct page *hpage; @@ -2257,7 +2416,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *ptl; + spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2300,12 +2459,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2313,16 +2472,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2330,7 +2489,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. 
Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2341,7 +2500,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2356,13 +2515,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); *hpage = NULL; @@ -2397,6 +2556,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2413,12 +2573,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. */ - if (node == NUMA_NO_NODE) - node = page_to_nid(page); + node = page_to_nid(page); + khugepaged_node_load[node]++; VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -2433,9 +2594,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } @@ -2687,6 +2850,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { + spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2697,29 +2861,37 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; +again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); put_page(page); - BUG_ON(pmd_trans_huge(*pmd)); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have 
copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; } void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b49579c7f2a5..dee6cf4e6d34 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -476,40 +476,6 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) return 0; } -static void copy_gigantic_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -653,6 +619,7 @@ static void free_huge_page(struct page *page) BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), @@ -695,8 +662,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwse drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } @@ -721,6 +702,23 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. 
+ */ +int PageHeadHuge(struct page *page_head) +{ + compound_page_dtor *dtor; + + if (!PageHead(page_head)) + return 0; + + dtor = get_compound_page_dtor(page_head); + + return dtor == free_huge_page; +} +EXPORT_SYMBOL_GPL(PageHeadHuge); + pgoff_t __basepage_index(struct page *page) { struct page *page_head = compound_head(page); @@ -1329,9 +1327,9 @@ static void __init gather_bootmem_prealloc(void) #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need @@ -2361,6 +2359,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2372,8 +2371,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2383,8 +2383,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); } return 0; @@ -2427,6 +2427,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2440,25 +2441,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: - spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - continue; + goto unlock; } page = pte_page(pte); @@ -2469,7 +2470,7 @@ again: */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2486,13 +2487,18 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + spin_unlock(ptl); break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2598,7 +2604,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int 
hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) + struct page *pagecache_page, spinlock_t *ptl) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -2632,8 +2638,8 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + /* Drop page table lock as buddy allocator may be called */ + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2651,13 +2657,13 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page_table_lock, and - * our job is done. + * race occurs while re-acquiring page table + * lock, and our job is done. */ return 0; } @@ -2665,7 +2671,7 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2680,7 +2686,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2692,10 +2698,10 @@ retry_avoidcopy: mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page_table_lock to check for racing updates + * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -2709,13 +2715,13 @@ retry_avoidcopy: /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return 0; } @@ -2763,6 +2769,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2849,7 +2856,8 @@ retry: goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2870,16 +2878,16 @@ retry: if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); @@ -2891,6 +2899,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2903,7 
+2912,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(mm, ptep); + migration_entry_wait_huge(vma, mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2959,17 +2968,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + goto out_ptl; if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page); - goto out_page_table_lock; + pagecache_page, ptl); + goto out_ptl; } entry = huge_pte_mkdirty(entry); } @@ -2978,8 +2988,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_page_table_lock: - spin_unlock(&mm->page_table_lock); +out_ptl: + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -3005,9 +3015,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -3015,8 +3025,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -3028,6 +3042,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } @@ -3047,10 +3063,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? 
FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3081,8 +3097,8 @@ same_page: */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3103,13 +3119,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { pages++; + spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3119,8 +3137,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: @@ -3283,6 +3301,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; + spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -3305,13 +3324,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); + spin_lock(ptl); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); @@ -3325,7 +3345,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with vma->vm_mm->page_table_lock held. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index afc2daa91c60..4c84678371eb 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!hwpoison_filter_enable) - goto inject; if (!pfn_valid(pfn)) return -ENXIO; @@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val) if (!get_page_unless_zero(hpage)) return 0; + if (!hwpoison_filter_enable) + goto inject; + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); /* diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e126b0ef9ad2..31f01c5011e5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } spin_lock_irqsave(&object->lock, flags); - if (ptr + size > object->pointer + object->size) { + if (size == SIZE_MAX) { + size = object->pointer + object->size - ptr; + } else if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. 
*/ - buf = kcalloc(nr_node_ids + nr_node_ids, - sizeof(*buf), GFP_KERNEL | __GFP_ZERO); + buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), + GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; diff --git a/mm/list_lru.c b/mm/list_lru.c index 72467914b856..72f9decb0104 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -81,8 +81,9 @@ restart: * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbesr of LRU_RETRY items */ - if (--(*nr_to_walk) == 0) + if (!*nr_to_walk) break; + --*nr_to_walk; ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { diff --git a/mm/madvise.c b/mm/madvise.c index 6975bc812542..539eeb96b323 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -343,10 +343,11 @@ static long madvise_remove(struct vm_area_struct *vma, */ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) { + struct page *p; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE) { - struct page *p; + for (; start < end; start += PAGE_SIZE << + compound_order(compound_head(p))) { int ret; ret = get_user_pages_fast(start, 1, 0, &p); diff --git a/mm/memblock.c b/mm/memblock.c index 0ac412a0a7ee..53e477bb5558 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,6 +20,8 @@ #include <linux/seq_file.h> #include <linux/memblock.h> +#include <asm-generic/sections.h> + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } +/* + * __memblock_find_range_bottom_up - find free area utility in bottom-up + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Utility called from memblock_find_in_range_node(), find free area bottom-up. + * + * RETURNS: + * Found address on success, 0 on failure. + */ +static phys_addr_t __init_memblock +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + cand = round_up(this_start, align); + if (cand < this_end && this_end - cand >= size) + return cand; + } + + return 0; +} + +/** + * __memblock_find_range_top_down - find free area utility, in top-down + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Utility called from memblock_find_in_range_node(), find free area top-down. + * + * RETURNS: + * Found address on success, 0 on failure. 
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+			       phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+
+	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+		if (this_end < size)
+			continue;
+
+		cand = round_down(this_end - size, align);
+		if (cand >= this_start)
+			return cand;
+	}
+
+	return 0;
+}
+
 /**
  * memblock_find_in_range_node - find free area in given range and node
  * @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * When the allocation direction is bottom-up, @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want the bottom-up allocation just near the kernel
+ * image so it is highly likely that the allocated memory and the kernel
+ * will reside in the same node.
+ *
+ * If bottom-up allocation fails, we will fall back to top-down allocation.
+ *
  * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align, int nid)
 {
-	phys_addr_t this_start, this_end, cand;
-	u64 i;
+	int ret;
+	phys_addr_t kernel_end;
 
 	/* pump up @end */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 	/* avoid allocating the first page */
 	start = max_t(phys_addr_t, start, PAGE_SIZE);
 	end = max(start, end);
+	kernel_end = __pa_symbol(_end);
 
-	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
-		this_start = clamp(this_start, start, end);
-		this_end = clamp(this_end, start, end);
+	/*
+	 * try bottom-up allocation only when bottom-up mode
+	 * is set and @end is above the kernel image.
+	 */
+	if (memblock_bottom_up() && end > kernel_end) {
+		phys_addr_t bottom_up_start;
 
-		if (this_end < size)
-			continue;
+		/* make sure we will allocate above the kernel */
+		bottom_up_start = max(start, kernel_end);
 
-		cand = round_down(this_end - size, align);
-		if (cand >= this_start)
-			return cand;
+		/* ok, try bottom-up allocation first */
+		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+						      size, align, nid);
+		if (ret)
+			return ret;
+
+		/*
+		 * we always limit bottom-up allocation above the kernel,
+		 * but top-down allocation doesn't have the limit, so
+		 * retrying top-down allocation may succeed when bottom-up
+		 * allocation failed.
+		 *
+		 * bottom-up allocation is expected to fail very rarely,
+		 * so we use WARN_ONCE() here to get a stack trace if
+		 * that ever happens.
+		 */
+		WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+			     "memory hotunplug may be affected\n");
 	}
-	return 0;
+
+	return __memblock_find_range_top_down(start, end, size, align, nid);
 }
 
 /**
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
  * Find @size free area aligned to @align in the specified range.
  *
  * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
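As a usage note for the new allocation direction: a hypothetical early-boot caller could flip the mode around a single search. The memblock_set_bottom_up() setter is assumed here as the counterpart of the memblock_bottom_up() predicate used above; only memblock_find_in_range() appears in this diff.

/* editor sketch, not part of the patch: prefer space just above the kernel */
static phys_addr_t __init example_find_near_kernel(phys_addr_t size)
{
	phys_addr_t addr;

	memblock_set_bottom_up(true);	/* assumed setter for the flag above */
	addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE,
				      size, PAGE_SIZE);	/* 0 on failure */
	memblock_set_bottom_up(false);
	return addr;
}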
*/ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..7f1a356153c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include <linux/limits.h> #include <linux/export.h> #include <linux/mutex.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -53,10 +54,12 @@ #include <linux/page_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/lockdep.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> #include <net/tcp_memcontrol.h> +#include "slab.h" #include <asm/uaccess.h> @@ -160,6 +163,10 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -168,6 +175,26 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -286,7 +313,7 @@ struct mem_cgroup { atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct tcp_memcontrol tcp_mem; + struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) /* analogous to slab_common's slab_caches list. per-memcg */ @@ -303,22 +330,6 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif - /* - * Protects soft_contributed transitions. - * See mem_cgroup_update_soft_limit - */ - spinlock_t soft_lock; - - /* - * If true then this group has increased parents' children_in_excess - * when it got over the soft limit. - * When a group falls bellow the soft limit, parents' children_in_excess - * is decreased and soft_contributed changed to false. - */ - bool soft_contributed; - - /* Number of children that are in soft limit excess */ - atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -327,7 +338,7 @@ struct mem_cgroup { static size_t memcg_size(void) { return sizeof(struct mem_cgroup) + - nr_node_ids * sizeof(struct mem_cgroup_per_node); + nr_node_ids * sizeof(struct mem_cgroup_per_node *); } /* internal only representation about the status of kmem accounting. */ @@ -422,6 +433,7 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -488,6 +500,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. 
+ */
+#define MEM_CGROUP_ID_MAX	USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+	/*
+	 * The ID of the root cgroup is 0, but memcg treats 0 as an
+	 * invalid ID, so we return (cgroup_id + 1).
+	 */
+	return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	struct cgroup_subsys_state *css;
+
+	css = css_from_id(id - 1, &mem_cgroup_subsys);
+	return mem_cgroup_from_css(css);
+}
+
 /* Writing them here to avoid exposing memcg's inner layout */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
@@ -540,13 +575,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 	if (!memcg || mem_cgroup_is_root(memcg))
 		return NULL;
 
-	return &memcg->tcp_mem.cg_proto;
+	return &memcg->tcp_mem;
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
-	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+	if (!memcg_proto_activated(&memcg->tcp_mem))
 		return;
 	static_key_slow_dec(&memcg_socket_limit_enabled);
 }
@@ -559,16 +594,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- *    but only a few kmem-limited. Or also, if we have, for instance, 200
- *    memcgs, and none but the 200th is kmem-limited, we'd have to have a
- *    200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- *    allocation in ->create(). At that point, we haven't yet allocated the
- *    css_id. Having a separate index prevents us from messing with the cgroup
- *    core for this
+ * The main reason for not using the cgroup id for this is that it works
+ * better in sparse environments, where we have a lot of memcgs but only
+ * a few kmem-limited ones. Also, if we have, for instance, 200 memcgs,
+ * and none but the 200th is kmem-limited, we'd have to have a 200-entry
+ * array for that.
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size. It will double each time we have to
@@ -583,14 +613,14 @@ int memcg_limited_groups_array_size;
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
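A small illustration of the id scheme introduced above: ids are the cgroup id shifted up by one, so 0 stays an invalid value and everything still fits in the unsigned short used by swap records. The helper below is hypothetical and only restates the invariant.

/* editor sketch, not part of the patch: the id round-trip */
static bool example_memcg_id_round_trips(struct mem_cgroup *memcg)
{
	unsigned short id = mem_cgroup_id(memcg);	/* cgroup id + 1, never 0 */

	VM_BUG_ON(id > MEM_CGROUP_ID_MAX);
	return mem_cgroup_from_id(id) == memcg;	/* caller holds rcu_read_lock() */
}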
*/ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be @@ -648,6 +678,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + spin_unlock(&mctz->lock); +} + + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. 
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+	    !css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -698,6 +886,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
+	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +894,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 		val += memcg->nocpu_base.events[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
+	put_online_cpus();
 	return val;
 }
 
@@ -822,48 +1012,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
- */ - while (delta && (parent = parent_mem_cgroup(parent))) - atomic_add(delta, &parent->children_in_excess); - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_add(delta, &root_mem_cgroup->children_in_excess); - spin_unlock(&memcg->soft_lock); -} - -/* * Check events in order. * */ @@ -886,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_soft_limit(memcg); + mem_cgroup_update_tree(memcg, page); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -929,15 +1077,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, - mem_cgroup_iter_filter cond) -{ - if (!cond) - return VISIT; - return cond(memcg, root); -} - /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -945,7 +1084,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) + struct mem_cgroup *last_visited) { struct cgroup_subsys_state *prev_css, *next_css; @@ -963,31 +1102,11 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - switch (mem_cgroup_filter(mem, root, cond)) { - case SKIP: + if (css_tryget(&mem->css)) + return mem; + else { prev_css = next_css; goto skip_node; - case SKIP_TREE: - if (mem == root) - return NULL; - /* - * css_rightmost_descendant is not an optimal way to - * skip through a subtree (especially for imbalanced - * trees leaning to right) but that's what we have right - * now. More effective solution would be traversing - * right-up for first non-NULL without calling - * css_next_descendant_pre afterwards. - */ - prev_css = css_rightmost_descendant(next_css); - goto skip_node; - case VISIT: - if (css_tryget(&mem->css)) - return mem; - else { - prev_css = next_css; - goto skip_node; - } - break; } } @@ -1051,7 +1170,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -1064,18 +1182,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. 
 */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1201,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1224,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1235,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1318,7 +1427,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 		return true;
 	if (!root_memcg->use_hierarchy || !memcg)
 		return false;
-	return css_is_ancestor(&memcg->css, &root_memcg->css);
+	return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
 }
 
 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1767,7 +1876,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
@@ -1790,6 +1898,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 }
+#if MAX_NUMNODES > 1
 
 /*
 * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,52 +1966,112 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For a quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to double-check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_nodes.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check the rest of the nodes.
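Since the iterator loses its filter argument above, every walk is now a plain pre-order traversal; a minimal (hypothetical) caller looks like the existing for_each_mem_cgroup_tree users.

/* editor sketch, not part of the patch: full-hierarchy walk */
static int example_count_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;
	int n = 0;

	for (iter = mem_cgroup_iter(root, NULL, NULL); iter;
	     iter = mem_cgroup_iter(root, iter, NULL))
		n++;	/* css references are handled by the iterator itself */
	return n;
}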
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not too excessive, so we do
+				 * not reclaim too much, nor so little that we
+				 * keep coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+	.name = "memcg_oom_lock",
+};
+#endif
+
 static DEFINE_SPINLOCK(memcg_oom_lock);
 
 /*
@@ -1940,7 +2109,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 			}
 			iter->oom_lock = false;
 		}
-	}
+	} else
+		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
 
 	spin_unlock(&memcg_oom_lock);
 
@@ -1952,6 +2122,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 	struct mem_cgroup *iter;
 
 	spin_lock(&memcg_oom_lock);
+	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
 	spin_unlock(&memcg_oom_lock);
@@ -2018,110 +2189,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely. This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges. Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc.
and we - * don't want the OOM handler to deadlock on them - * while we sit here and wait. Store the current OOM - * context in the task_struct, then return -ENOMEM. - * At the end of the page fault handler, with the - * stack unwound, pagefault_out_of_memory() will check - * back with us by calling - * mem_cgroup_oom_synchronize(), possibly putting the - * task to sleep. - */ - current->memcg_oom.oom_locked = locked; - current->memcg_oom.wakeups = wakeups; - css_get(&memcg->css); - current->memcg_oom.wait_on_memcg = memcg; - } + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; } /** * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise. */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - struct mem_cgroup *memcg; + bool locked; /* OOM is global, do not handle */ - if (!current->memcg_oom.in_memcg_oom) - return false; - - /* - * We invoked the OOM killer but there is a chance that a kill - * did not free up any charges. Everybody else might already - * be sleeping, so restart the fault and keep the rampage - * going until some charges are released. 
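The deferral described above changes the calling convention: charge-time code only records the OOM context, and the fault path is expected to finish it once locks are dropped. A sketch of that end-of-fault contract; the call-site wiring is assumed rather than shown in this hunk.

/* editor sketch, not part of the patch: finishing a recorded memcg OOM */
static void example_end_of_fault(int fault_ret)
{
	if (task_in_memcg_oom(current) && !(fault_ret & VM_FAULT_OOM))
		mem_cgroup_oom_synchronize(false);	/* just clean up state */
	else if (fault_ret & VM_FAULT_OOM)
		mem_cgroup_oom_synchronize(true);	/* kill or wait now */
}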
- */ - memcg = current->memcg_oom.wait_on_memcg; if (!memcg) - goto out; + return false; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - goto out_memcg; + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; @@ -2130,13 +2250,25 @@ bool mem_cgroup_oom_synchronize(void) INIT_LIST_HEAD(&owait.wait.task_list); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - /* Only sleep if we didn't miss any wakeups since OOM */ - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); + } else { schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: - mem_cgroup_unmark_under_oom(memcg); - if (current->memcg_oom.oom_locked) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + + if (locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2145,10 +2277,9 @@ out_memcg: */ memcg_oom_recover(memcg); } +cleanup: + current->memcg_oom.memcg = NULL; css_put(&memcg->css); - current->memcg_oom.wait_on_memcg = NULL; -out: - current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2562,6 +2693,12 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, || fatal_signal_pending(current))) goto bypass; + if (unlikely(task_in_memcg_oom(current))) + goto nomem; + + if (gfp_mask & __GFP_NOFAIL) + oom = false; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -2659,8 +2796,10 @@ done: *ptr = memcg; return 0; nomem: - *ptr = NULL; - return -ENOMEM; + if (!(gfp_mask & __GFP_NOFAIL)) { + *ptr = NULL; + return -ENOMEM; + } bypass: *ptr = root_mem_cgroup; return -EINTR; @@ -2709,15 +2848,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return mem_cgroup_from_css(css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -2812,7 +2946,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, unlock_page_cgroup(pc); /* - * "charge_statistics" updated event counter. + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. */ memcg_check_events(memcg, page); } @@ -2836,7 +2972,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) VM_BUG_ON(p->is_root_cache); cachep = p->root_cache; - return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; + return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } #ifdef CONFIG_SLABINFO @@ -2865,21 +3001,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) struct res_counter *fail_res; struct mem_cgroup *_memcg; int ret = 0; - bool may_oom; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - /* - * Conditions under which we can wait for the oom_killer. 
Those are - * the same conditions tested by the core page allocator - */ - may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - _memcg = memcg; ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, may_oom); + &_memcg, oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* @@ -3019,7 +3148,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; - VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(s)); if (num_groups > memcg_limited_groups_array_size) { int i; @@ -3280,7 +3409,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, idx = memcg_cache_id(memcg); mutex_lock(&memcg_cache_mutex); - new_cachep = cachep->memcg_params->memcg_caches[idx]; + new_cachep = cache_from_memcg_idx(cachep, idx); if (new_cachep) { css_put(&memcg->css); goto out; @@ -3326,8 +3455,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * we'll take the set_limit_mutex to protect ourselves against this. */ mutex_lock(&set_limit_mutex); - for (i = 0; i < memcg_limited_groups_array_size; i++) { - c = s->memcg_params->memcg_caches[i]; + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); if (!c) continue; @@ -3460,8 +3589,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (likely(cachep->memcg_params->memcg_caches[idx])) { - cachep = cachep->memcg_params->memcg_caches[idx]; + if (likely(cache_from_memcg_idx(cachep, idx))) { + cachep = cache_from_memcg_idx(cachep, idx); goto out; } @@ -3663,8 +3792,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, { /* Update stat data for mem_cgroup */ preempt_disable(); - WARN_ON_ONCE(from->stat->count[idx] < nr_pages); - __this_cpu_add(from->stat->count[idx], -nr_pages); + __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); preempt_enable(); } @@ -4232,7 +4360,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) * css_get() was called in uncharge(). 
 	 */
 	if (do_swap_account && swapout && memcg)
-		swap_cgroup_record(ent, css_id(&memcg->css));
+		swap_cgroup_record(ent, mem_cgroup_id(memcg));
 }
 #endif
 
@@ -4284,8 +4412,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 {
 	unsigned short old_id, new_id;
 
-	old_id = css_id(&from->css);
-	new_id = css_id(&to->css);
+	old_id = mem_cgroup_id(from);
+	new_id = mem_cgroup_id(to);
 
 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
 		mem_cgroup_swap_statistics(from, false);
@@ -4647,6 +4775,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run for a while, especially if mem_cgroups
+	 * continuously keep exceeding their soft limit and putting the
+	 * system under pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have added the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different memcg.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+		    (next_mz == NULL ||
+		     loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
 * mem_cgroup_force_empty_list - clears LRU of a group
 * @memcg: group to clear
@@ -4748,31 +4968,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 	} while (usage > 0);
 }
 
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
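A worked example of the stop conditions in the two reclaim loops above, with made-up numbers: if a group is 8192 pages over its soft limit, excess >> 2 is 2048, so an inner pass ends once roughly 2048 pages are reclaimed (or after 100 full hierarchy walks), and the outer loop gives up after two passes over the tree without progress. The tiny helper below only restates the inner test.

/* editor sketch, not part of the patch: the inner-loop termination test */
static bool example_soft_reclaim_done(int total, unsigned long excess_pages,
				      int loop)
{
	/* stop after reclaiming ~a quarter of the excess, or too many walks */
	return total >= (excess_pages >> 2) ||
	       loop > MEM_CGROUP_MAX_RECLAIM_LOOPS;	/* 100 */
}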
- * Since this is the very setting we are changing, the current hierarchy value - * is meaningless - */ -static inline bool __memcg_has_children(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *pos; - - /* bounce at first found */ - css_for_each_child(pos, &memcg->css) - return true; - return false; -} - -/* - * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed - * to be already dead (as in mem_cgroup_force_empty, for instance). This is - * from mem_cgroup_count_children(), in the sense that we don't really care how - * many children we have; we only need to know if we have any. It also counts - * any memcg without hierarchy as infertile. - */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - return memcg->use_hierarchy && __memcg_has_children(memcg); + lockdep_assert_held(&memcg_create_mutex); + /* + * The lock does not prevent addition or deletion to the list + * of children, but it prevents a new child from being + * initialized based on this parent in css_online(), so it's + * enough to decide whether hierarchically inherited + * attributes can still be changed or not. + */ + return memcg->use_hierarchy && + !list_empty(&memcg->css.cgroup->children); } /* @@ -4852,7 +5059,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (!__memcg_has_children(memcg)) + if (list_empty(&memcg->css.cgroup->children)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -5179,45 +5386,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static int memcg_numa_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; int nid; - unsigned long total_nr, file_nr, anon_nr, unevictable_nr; - unsigned long node_nr; + unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); - seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); - seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_FILE); - seq_printf(m, " N%d=%lu", nid, node_nr); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); } - seq_putc(m, '\n'); - anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); - seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_ANON); - seq_printf(m, " N%d=%lu", nid, node_nr); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", 
stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); } - seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - BIT(LRU_UNEVICTABLE)); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); return 0; } #endif /* CONFIG_NUMA */ @@ -5911,6 +6123,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); + mz->usage_in_excess = 0; + mz->on_tree = false; mz->memcg = memcg; } memcg->nodeinfo[node] = pn; @@ -5966,7 +6180,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); - free_css_id(&mem_cgroup_subsys, &memcg->css); + mem_cgroup_remove_from_trees(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@ -6002,6 +6216,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); +static void __init mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node(node) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + BUG_ON(!rtpn); + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } +} + static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6031,7 +6268,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); - spin_lock_init(&memcg->soft_lock); return &memcg->css; @@ -6047,6 +6283,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + if (!parent) return 0; @@ -6109,13 +6348,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); - if (memcg->soft_contributed) { - while ((memcg = parent_mem_cgroup(memcg))) - atomic_dec(&memcg->children_in_excess); - - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_dec(&root_mem_cgroup->children_in_excess); - } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6123,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * XXX: css_offline() would be where we should reparent all + * memory to prepare the cgroup for destruction. However, + * memcg does not do css_tryget() and res_counter charging + * under the same RCU lock region, which means that charging + * could race with offlining. 
Offlining only happens to + * cgroups with no tasks in them but charges can show up + * without any tasks from the swapin path when the target + * memcg is looked up from the swapout record and not from the + * current task as it usually is. A race like this can leak + * charges and put pages with stale cgroup pointers into + * circulation: + * + * #0 #1 + * lookup_swap_cgroup_id() + * rcu_read_lock() + * mem_cgroup_lookup() + * css_tryget() + * rcu_read_unlock() + * disable css_tryget() + * call_rcu() + * offline_css() + * reparent_charges() + * res_counter_charge() + * css_put() + * css_free() + * pc->mem_cgroup = dead memcg + * add page to lru + * + * The bulk of the charges are still moved in offline_css() to + * avoid pinning a lot of pages in case a long-term reference + * like a swapout record is deferring the css_free() to long + * after offlining. But this makes sure we catch any charges + * made after offlining: + */ + mem_cgroup_reparent_charges(memcg); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -6325,7 +6593,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@ -6376,10 +6644,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6568,9 +6836,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6587,7 +6855,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6745,7 +7013,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP @@ -6790,6 +7057,7 @@ static int __init mem_cgroup_init(void) { hotcpu_notifier(memcg_cpu_hotplug_callback, 0); enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); memcg_stock_init(); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 947ed5413279..fabe55046c1d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -938,6 +938,16 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount + * to it. + */ + if (hpage != p) { + put_page(hpage); + get_page(p); + } /* THP is split, so ppage should be the real poisoned page. */ ppage = p; } @@ -1114,8 +1124,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * shake_page could have turned it free. 
*/ if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", - DELAYED); + if (flags & MF_COUNT_INCREASED) + action_result(pfn, "free buddy", DELAYED); + else + action_result(pfn, "free buddy, 2nd try", DELAYED); return 0; } action_result(pfn, "non LRU", IGNORED); @@ -1267,7 +1279,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, &entry)) + if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", @@ -1349,7 +1361,7 @@ int unpoison_memory(unsigned long pfn) * worked by memory_failure() and the page lock is not held yet. * In such case, we yield to memory_failure() and make unpoison fail. */ - if (PageTransHuge(page)) { + if (!PageHuge(page) && PageTransHuge(page)) { pr_info("MCE: Memory failure is now running on %#lx\n", pfn); return 0; } @@ -1421,19 +1433,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); - - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. - */ - if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) - set_migratetype_isolate(p, true); - /* * When the target page is a free hugepage, just remove it * from free hugepage list. */ @@ -1453,7 +1452,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unlock_memory_hotplug(); return ret; } @@ -1517,10 +1515,16 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + /* overcommit hugetlb page will be freed to buddy */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } } return ret; } @@ -1652,15 +1656,28 @@ int soft_offline_page(struct page *page, int flags) } } + /* + * The lock_memory_hotplug prevents a race with memory hotplug. + * This is a big hammer, a better would be nicer. + */ + lock_memory_hotplug(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. 
+ */ + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + set_migratetype_isolate(page, true); + ret = get_any_page(page, pfn, flags); - if (ret < 0) - goto unset; - if (ret) { /* for in-use pages */ + unlock_memory_hotplug(); + if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - } else { /* for free pages */ + } else if (ret == 0) { /* for free pages */ if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); @@ -1671,7 +1688,6 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } -unset: unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..6768ce9e57d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -552,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { + spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); int wait_split_huge_page; if (!new) @@ -572,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
*/ - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -681,7 +680,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (vma->vm_ops) printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", vma->vm_ops->fault); - if (vma->vm_file && vma->vm_file->f_op) + if (vma->vm_file) printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", vma->vm_file->f_op->mmap); dump_stack(); @@ -837,6 +836,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } @@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* fall through */ } split_fallthrough: @@ -2719,6 +2720,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); reuse: + /* + * Clear the page's cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + if (old_page) + page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -3519,13 +3528,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) + unsigned long addr, int page_nid, + int *flags) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } return mpol_misplaced(page, vma, addr); } @@ -3535,9 +3547,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid = -1; + int page_nid = -1; + int last_cpupid; int target_nid; bool migrated = false; + int flags = 0; /* * The "pte" at this point cannot be used safely without @@ -3564,123 +3578,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } + BUG_ON(is_zero_pfn(page_to_pfn(page))); + + /* + * Avoid grouping on DSO/COW pages in particular and RO pages + * in general; RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pte)) + flags |= TNF_NO_GROUP; - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); + /* + * Flag if the page is shared between multiple address spaces.
This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + last_cpupid = page_cpupid_last(page); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. - */ - current_nid = numa_node_id(); put_page(page); goto out; } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - current_nid = target_nid; - -out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); - return 0; -} - -/* NUMA hinting page fault entry point for regular pmds */ -#ifdef CONFIG_NUMA_BALANCING -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - pte_t *pte, *orig_pte; - unsigned long _addr = addr & PMD_MASK; - unsigned long offset; - spinlock_t *ptl; - bool numa = false; - int local_nid = numa_node_id(); - - spin_lock(&mm->page_table_lock); - pmd = *pmdp; - if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); - numa = true; + migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { + page_nid = target_nid; + flags |= TNF_MIGRATED; } - spin_unlock(&mm->page_table_lock); - - if (!numa) - return 0; - /* we're in a page fault so some vma must be in the range */ - BUG_ON(!vma); - BUG_ON(vma->vm_start >= _addr + PMD_SIZE); - offset = max(_addr, vma->vm_start) & ~PMD_MASK; - VM_BUG_ON(offset >= PMD_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); - pte += offset >> PAGE_SHIFT; - for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { - pte_t pteval = *pte; - struct page *page; - int curr_nid = local_nid; - int target_nid; - bool migrated; - if (!pte_present(pteval)) - continue; - if (!pte_numa(pteval)) - continue; - if (addr >= vma->vm_end) { - vma = find_vma(mm, addr); - /* there's a pte present so there must be a vma */ - BUG_ON(!vma); - BUG_ON(addr < vma->vm_start); - } - if (pte_numa(pteval)) { - pteval = pte_mknonnuma(pteval); - set_pte_at(mm, addr, pte, pteval); - } - page = vm_normal_page(vma, addr, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (unlikely(page_mapcount(page) != 1)) - continue; - - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. 
- */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { - put_page(page); - continue; - } - - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); - - pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); - } - pte_unmap_unlock(orig_pte, ptl); - - return 0; -} -#else -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - BUG(); +out: + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } -#endif /* CONFIG_NUMA_BALANCING */ /* * These routines also need to handle stuff like marking pages dirty @@ -3820,8 +3755,8 @@ retry: } } - if (pmd_numa(*pmd)) - return do_pmd_numa_page(mm, vma, address, pmd); + /* THP should already have been handled */ + BUG_ON(pmd_numa(*pmd)); /* * Use __pte_alloc instead of pte_alloc_map, because we can't @@ -3863,15 +3798,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_enable_oom(); + mem_cgroup_oom_enable(); ret = __handle_mm_fault(mm, vma, address, flags); - if (flags & FAULT_FLAG_USER) - mem_cgroup_disable_oom(); - - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) - mem_cgroup_oom_synchronize(); + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } return ret; } @@ -4329,3 +4270,21 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + +void ptlock_free(struct page *page) +{ + kfree(page->ptl); +} +#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed85fe3870e2..489f235502db 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include <linux/firmware-map.h> #include <linux/stop_machine.h> #include <linux/hugetlb.h> +#include <linux/memblock.h> #include <asm/tlbflush.h> @@ -365,8 +366,7 @@ out_fail: static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long old_pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; @@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) static int __meminit __add_section(int nid, struct zone *zone, unsigned long phys_start_pfn) { - int nr_pages = PAGES_PER_SECTION; int ret; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + ret = sparse_add_one_section(zone, phys_start_pfn); if (ret < 0) return ret; @@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long 
start_pfn, static void shrink_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pgdat_start_pfn = pgdat->node_start_pfn; - unsigned long pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ + unsigned long pgdat_end_pfn = p; unsigned long pfn; struct mem_section *ms; int nid = pgdat->node_id; @@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = page_to_nid(pfn_to_page(pfn)); + nid = pfn_to_nid(pfn); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -/* +/** + * try_online_node - online a node if offlined + * * called by cpu_up() to online a node without onlined memory. */ -int mem_online_node(int nid) +int try_online_node(int nid) { pg_data_t *pgdat; int ret; + if (node_online(nid)) + return 0; + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } @@ -1062,6 +1067,12 @@ int mem_online_node(int nid) ret = register_one_node(nid); BUG_ON(ret); + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } + out: unlock_memory_hotplug(); return ret; @@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) } #endif /* CONFIG_MOVABLE_NODE */ +static int __init cmdline_parse_movable_node(char *p) +{ +#ifdef CONFIG_MOVABLE_NODE + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at a very early time. We + * cannot prevent this anyway. So on NUMA systems, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes of memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try our best to keep + * the kernel away from hotpluggable memory. + */ + memblock_set_bottom_up(true); +#else + pr_warn("movable_node option not supported\n"); +#endif + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, } #ifdef CONFIG_MEMORY_HOTREMOVE -static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) +static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) * if this is not the case.
*/ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - is_memblock_offlined_cb); + check_memblock_offlined_cb); if (ret) { unlock_memory_hotplug(); BUG(); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f359..0cd2c4d4e270 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE int nid; struct page *page; + spinlock_t *ptl; - spin_lock(&vma->vm_mm->page_table_lock); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); page = pte_page(huge_ptep_get((pte_t *)pmd)); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) isolate_huge_page(page, private); unlock: - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); #else BUG(); #endif @@ -1125,7 +1126,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = -1; + int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { @@ -1160,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (!node_isset(dest, tmp)) break; } - if (source == -1) + if (source == NUMA_NO_NODE) break; node_clear(source, tmp); @@ -1196,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } + + if (PageHuge(page)) { + if (vma) + return alloc_huge_page_noerr(vma, address, 1); + else + return NULL; + } /* - * queue_pages_range() confirms that @page belongs to some vma, - * so vma shouldn't be NULL. + * If !vma, alloc_page_vma() will use the task or system default policy. */ - BUG_ON(!vma); - - if (PageHuge(page)) - return alloc_huge_page_noerr(vma, address, 1); return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else @@ -1317,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); up_write(&mm->mmap_sem); mpol_out: @@ -1679,6 +1682,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ + struct mempolicy *pol = get_task_policy(task); + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } else if (vma->vm_policy) { + pol = vma->vm_policy; + } + } + + if (!pol) + return default_policy.flags & MPOL_F_MOF; + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -1811,7 +1838,7 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = -1; + int nid = NUMA_NO_NODE; if (!nnodes) return numa_node_id(); @@ -1848,11 +1875,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, /* * Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty) + * (returns NUMA_NO_NODE if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = -1; + int w, bit = NUMA_NO_NODE; w = nodes_weight(*maskp); if (w) @@ -2277,6 +2304,35 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +#ifdef CONFIG_NUMA_BALANCING +static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + /* Never defer a private fault */ + if (cpupid_match_pid(p, last_cpupid)) + return false; + + if (p->numa_migrate_deferred) { + p->numa_migrate_deferred--; + return true; + } + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ + p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; +} +#else +static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2300,6 +2356,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2348,9 +2406,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_cpupid; + int this_cpupid; - polnid = numa_node_id(); + polnid = thisnid; + this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2373,8 +2433,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + + /* See sysctl_numa_balancing_migrate_deferred comment */ + if (!cpupid_match_pid(current, last_cpupid)) + defer_numa_migrate(current); + + goto out; + } + + /* + * The quadratic filter above reduces extraneous migration + * of shared pages somewhat. This code reduces it even more, + * reducing the overhead of page migrations of shared pages. + * This makes workloads with shared pages rely more on + * "move task near its memory", and less on "move memory + * towards its task", which is exactly what we want. + */ + if (numa_migrate_deferred(current, last_cpupid)) goto out; } @@ -2840,62 +2917,45 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? 
pol->flags : 0; - - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2907,10 +2967,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; + p += snprintf(p, buffer + maxlen - p, ":"); p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } - return p - buffer; } diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30b..9194375b2307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> #include <linux/balloon_compaction.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> @@ -107,7 +108,7 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(balloon_page_movable(page))) + if (unlikely(isolated_balloon_page(page))) balloon_page_putback(page); else putback_lru_page(page); @@ -130,7 +131,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); } else { pmd = mm_find_pmd(mm, addr); if (!pmd) @@ -161,6 +162,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE @@ -247,9 +250,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { - spinlock_t *ptl = &(mm)->page_table_lock; + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); __migration_entry_wait(mm, pte, ptl); } @@ -313,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, */ int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode) + struct buffer_head *head, enum migrate_mode mode, + int extra_count) { - int expected_count = 0; + int expected_count = 1 + extra_count; void **pslot; if (!mapping) { /* 
Anonymous page without mapping */ - if (page_count(page) != 1) + if (page_count(page) != expected_count) return -EAGAIN; return MIGRATEPAGE_SUCCESS; } @@ -330,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + page_has_private(page); + expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -439,10 +444,60 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* + * Gigantic pages are so large that we do not guarantee that page++ pointer + * arithmetic will work across the entire page. We need something more + * specialized. + */ +static void __copy_gigantic_page(struct page *dst, struct page *src, + int nr_pages) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < nr_pages; ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +static void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + int nr_pages; + + if (PageHuge(src)) { + /* hugetlbfs page */ + struct hstate *h = page_hstate(src); + nr_pages = pages_per_huge_page(h); + + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { + __copy_gigantic_page(dst, src, nr_pages); + return; + } + } else { + /* thp page */ + BUG_ON(!PageTransHuge(src)); + nr_pages = hpage_nr_pages(src); + } + + for (i = 0; i < nr_pages; i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + +/* * Copy the page to its new location */ void migrate_page_copy(struct page *newpage, struct page *page) { + int cpupid; + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else @@ -479,6 +534,13 @@ void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + /* + * Copy NUMA information to the new page, to prevent over-eager + * future migrations of this same page. 
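+ * The value is handed over via page_cpupid_xchg_last() below; the old page is reset to -1 in the same exchange.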
+ */ + cpupid = page_cpupid_xchg_last(page, -1); + page_cpupid_xchg_last(newpage, cpupid); + mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* @@ -523,7 +585,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -550,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping, head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -1498,7 +1560,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nid_xchg_last(newpage, page_nid_last(page)); + page_cpupid_xchg_last(newpage, page_cpupid_last(page)); return newpage; } @@ -1594,12 +1656,25 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 1; } +bool pmd_trans_migrating(pmd_t pmd) +{ + struct page *page = pmd_page(pmd); + return PageLocked(page); +} + +void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + wait_on_page_locked(page); +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on * the page that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, int node) +int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, + int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; @@ -1607,10 +1682,11 @@ int migrate_misplaced_page(struct page *page, int node) LIST_HEAD(migratepages); /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer + * Don't migrate file pages that are mapped in multiple processes + * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1) + if (page_mapcount(page) != 1 && page_is_file_cache(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -1653,19 +1729,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { - unsigned long haddr = address & HPAGE_PMD_MASK; + spinlock_t *ptl; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) - goto out_dropref; + unsigned long mmun_start = address & HPAGE_PMD_MASK; + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + pmd_t orig_entry; /* * Rate-limit the amount of data that is being migrated to a node. 
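Most of the memory.c, mmzone.c and migrate.c hunks in this section replace the old last_nid tracking with a combined "cpupid", which records both the CPU and the low bits of the PID of the last accessor, so the NUMA-balancing fault path can tell private faults (the same task returning) from shared ones. The following is a minimal standalone sketch of that packing only; the bit widths and helper names are hypothetical, chosen for illustration, and are not the kernel's actual LAST_CPUPID_* layout:

#include <stdio.h>

/* Hypothetical field widths, for illustration only. */
#define PID_BITS 8
#define PID_MASK ((1 << PID_BITS) - 1)

/* Pack a cpu number and the low bits of a pid into one value. */
static int pack_cpupid(int cpu, int pid)
{
	return (cpu << PID_BITS) | (pid & PID_MASK);
}

static int cpupid_to_cpu(int cpupid)
{
	return cpupid >> PID_BITS;
}

static int cpupid_to_pid(int cpupid)
{
	return cpupid & PID_MASK;
}

int main(void)
{
	int last = pack_cpupid(3, 1234);

	/* Only the low pid bits survive, so a match against the current
	 * task is probabilistic, which is acceptable for a heuristic. */
	printf("cpu=%d pid(low bits)=%d\n",
	       cpupid_to_cpu(last), cpupid_to_pid(last));
	return 0;
}

On such an encoding, a stored value whose pid bits match the faulting task indicates a likely private fault; compare the cpupid_match_pid() checks in the mempolicy.c hunks above.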
@@ -1680,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nid_xchg_last(new_page, page_nid_last(page)); + page_cpupid_xchg_last(new_page, page_cpupid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { @@ -1688,6 +1760,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_fail; } + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, mmun_start, mmun_end); + /* Prepare a page as a migration target */ __set_page_locked(new_page); SetPageSwapBacked(new_page); @@ -1699,9 +1774,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { +fail_putback: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1713,12 +1791,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unlock_page(new_page); put_page(new_page); /* Free it */ - unlock_page(page); + /* Retake the callers reference and putback on LRU */ + get_page(page); putback_lru_page(page); + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - isolated = 0; - goto out; + goto out_unlock; } /* @@ -1730,23 +1809,43 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, */ mem_cgroup_prepare_migration(page, new_page, &memcg); + orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mknonnuma(entry); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(new_page, vma, haddr); - - set_pmd_at(mm, haddr, pmd, entry); + /* + * Clear the old entry under pagetable lock and establish the new PTE. + * Any parallel GUP will either observe the old page blocking on the + * page lock, block on the page table lock or observe the new page. + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ + flush_cache_range(vma, mmun_start, mmun_end); + page_add_new_anon_rmap(new_page, vma, mmun_start); + pmdp_clear_flush(vma, mmun_start, pmd); + set_pmd_at(mm, mmun_start, pmd, entry); + flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { + set_pmd_at(mm, mmun_start, pmd, orig_entry); + flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; + } + page_remove_rmap(page); + /* * Finish the charge transaction under the page table lock to * prevent split_huge_page() from dividing up the charge * before it's fully transferred to the new page. 
*/ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(new_page); unlock_page(page); @@ -1756,7 +1855,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); -out: mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); @@ -1765,6 +1863,15 @@ out: out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, mmun_start, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(ptl); + +out_unlock: unlock_page(page); put_page(page); return 0; diff --git a/mm/mlock.c b/mm/mlock.c index d63802663242..192e6eebe4f2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -133,7 +133,10 @@ static void __munlock_isolation_failed(struct page *page) /** * munlock_vma_page - munlock a vma page - * @page - page to be unlocked + * @page - page to be unlocked, either a normal page or THP page head + * + * returns the size of the page as a page mask (0 for normal page, + * HPAGE_PMD_NR - 1 for THP head page) * * called from munlock()/munmap() path with page supposedly on the LRU. * When we munlock a page, because the vma where we found the page is being @@ -148,21 +151,30 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int page_mask = 0; + unsigned int nr_pages; BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - unsigned int nr_pages = hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - page_mask = nr_pages - 1; if (!isolate_lru_page(page)) __munlock_isolated_page(page); else __munlock_isolation_failed(page); + } else { + nr_pages = hpage_nr_pages(page); } - return page_mask; + /* + * Regardless of the original PageMlocked flag, we determine nr_pages + * after touching the flag. This leaves a possible race with a THP page + * split, such that a whole THP page was munlocked, but nr_pages == 1. + * Returning a smaller mask due to that is OK, the worst that can + * happen is subsequent useless scanning of the former tail pages. + * The NR_MLOCK accounting can however become broken. + */ + return nr_pages - 1; } /** @@ -286,10 +298,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked = -nr; + int delta_munlocked; struct pagevec pvec_putback; int pgrescued = 0; + pagevec_init(&pvec_putback, 0); + /* Phase 1: page isolation */ spin_lock_irq(&zone->lru_lock); for (i = 0; i < nr; i++) { @@ -318,18 +332,21 @@ skip_munlock: /* * We won't be munlocking this page in the next phase * but we still need to release the follow_page_mask() - * pin. + * pin. We cannot do it under lru_lock however. If it's + * the last pin, __page_cache_release would deadlock. 
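+ * They are instead collected in pvec_putback and released via pagevec_release() once lru_lock is dropped.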
*/ + pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; - put_page(page); - delta_munlocked++; } } + delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); + /* Now we can release pins of pages that we are not munlocking */ + pagevec_release(&pvec_putback); + /* Phase 2: page munlock */ - pagevec_init(&pvec_putback, 0); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -379,10 +396,14 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, /* * Initialize pte walk starting at the already pinned page where we - * are sure that there is a pte. + * are sure that there is a pte, as it was pinned under the same + * mmap_sem write op. */ pte = get_locked_pte(vma->vm_mm, start, &ptl); - end = min(end, pmd_addr_end(start, end)); + /* Make sure we do not cross the page table boundary */ + end = pgd_addr_end(start, end); + end = pud_addr_end(start, end); + end = pmd_addr_end(start, end); /* The page next to the pinned page is the first we will try to get */ start += PAGE_SIZE; @@ -436,7 +457,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, while (start < end) { struct page *page = NULL; - unsigned int page_mask, page_increm; + unsigned int page_mask; + unsigned long page_increm; struct pagevec pvec; struct zone *zone; int zoneid; @@ -486,7 +508,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, goto next; } } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + /* It's a bug to munlock in the middle of a THP page */ + VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + page_mask; start += page_increm * PAGE_SIZE; next: cond_resched(); @@ -736,6 +760,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + cond_resched(); } out: return 0; diff --git a/mm/mm_init.c b/mm/mm_init.c index 633c08863fd8..68562e92d50c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastnid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NID_WIDTH, + LAST_CPUPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastnid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NID_SHIFT); + LAST_CPUPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastnid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, 
"pageflags_layout_nodeflags", - "Last nid not in page flags"); + "Last cpupid not in page flags"); #endif if (SECTIONS_WIDTH) { diff --git a/mm/mmap.c b/mm/mmap.c index 9d548512ff8a..834b2d785f1e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover @@ -1299,7 +1297,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; struct vm_unmapped_area_info info; - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } info.flags = 0; info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; + info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; info.align_mask = 0; return vm_unmapped_area(&info); @@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, struct vm_unmapped_area_info info; /* requested length too big for entire address space */ - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base; info.align_mask = 0; addr = vm_unmapped_area(&info); @@ -1951,7 +1949,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return -ENOMEM; get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) + if (file && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) @@ -2726,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_long_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afbd68f3..bf34fb8556db 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_nid_xchg_last(struct page *page, int nid) +#if defined(CONFIG_NUMA_BALANCING) && 
!defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) { unsigned long old_flags, flags; - int last_nid; + int last_cpupid; do { old_flags = flags = page->flags; - last_nid = page_nid_last(page); + last_cpupid = page_cpupid_last(page); - flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nid; + return last_cpupid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..bb53a6591aea 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa, bool *ret_all_same_node) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; - bool all_same_node = true; - int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -54,25 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t ptent; bool updated = false; - ptent = ptep_modify_prot_start(mm, addr, pte); if (!prot_numa) { + ptent = ptep_modify_prot_start(mm, addr, pte); + if (pte_numa(ptent)) + ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); updated = true; } else { struct page *page; + ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page) { - int this_nid = page_to_nid(page); - if (last_nid == -1) - last_nid = this_nid; - if (last_nid != this_nid) - all_same_node = false; - - /* only check non-shared pages */ - if (!pte_numa(oldpte) && - page_mapcount(page) == 1) { + if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); + set_pte_at(mm, addr, pte, ptent); updated = true; } } @@ -89,45 +83,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (updated) pages++; - ptep_modify_prot_commit(mm, addr, pte, ptent); + + /* Only !prot_numa always clears the pte */ + if (!prot_numa) + ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { + pte_t newpte; /* * A protection check is difficult so * just be safe and disable write */ make_migration_entry_read(&entry); - set_pte_at(mm, addr, pte, - swp_entry_to_pte(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); + + pages++; } - pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - *ret_all_same_node = all_same_node; return pages; } -#ifdef CONFIG_NUMA_BALANCING -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); -} -#else -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t 
*pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -135,36 +119,39 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - bool all_same_node; + unsigned long nr_huge_updates = 0; pmd = pmd_offset(pud, addr); do { + unsigned long this_pages; + next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot, - prot_numa)) { - pages += HPAGE_PMD_NR; - continue; + else { + int nr_ptes = change_huge_pmd(vma, pmd, addr, + newprot, prot_numa); + + if (nr_ptes) { + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } + continue; + } } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa, &all_same_node); - - /* - * If we are changing protections for NUMA hinting faults then - * set pmd_numa if the examined pages were all on the same - * node. This allows a regular PMD to be handled as one fault - * and effectively batches the taking of the PTL - */ - if (prot_numa && all_same_node) - change_pmd_protnuma(vma->vm_mm, addr, pmd); + this_pages = change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + pages += this_pages; } while (pmd++, addr = next, addr != end); + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; } @@ -201,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -212,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); return pages; } diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,7 +25,6 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> -#include <asm/pgalloc.h> #include "internal.h" @@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) { - pud_free(mm, pud); + if (!pmd) return NULL; - } VM_BUG_ON(pmd_trans_huge(*pmd)); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 61107cf55bb3..2c254d374655 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static void __init __free_pages_memory(unsigned long start, unsigned long end) { - unsigned long i, start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); + int order; - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + while (start + (1UL << order) > end) + order--; - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + __free_pages_bootmem(pfn_to_page(start), order); - for (i = start_aligned; i < end_aligned; i += 
BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + start += (1UL << order); + } } static unsigned long __init __free_memory_core(phys_addr_t start, diff --git a/mm/nommu.c b/mm/nommu.c index ecd1f158548e..fec093adad9c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file, struct address_space *mapping; /* files must support mmap */ - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) return -ENODEV; /* work out if what we've got could possibly be shared @@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* * Reserve some 3% for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..1e4a600a6163 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_long_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; - if (mem_cgroup_oom_synchronize()) + if (mem_cgroup_oom_synchronize(true)) return; zonelist = node_zonelist(first_online_node, GFP_KERNEL); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) { - long bw = bdi->avg_write_bandwidth; - long t; + unsigned long bw = bdi->avg_write_bandwidth; + unsigned long t; /* * Limit pause time for small memory systems. 
If sleeping for too long @@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; - return min_t(long, t, MAX_PAUSE); + return min_t(unsigned long, t, MAX_PAUSE); } static long bdi_min_pause(struct backing_dev_info *bdi, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ee638f76ebe..5248fe070aa4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) { - - if (unlikely(page_group_by_mobility_disabled)) + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; set_pageblock_flags_group(page, (unsigned long)migratetype, @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nid_reset_last(page); + page_cpupid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, { int current_order = page_order(page); + /* + * When borrowing from MIGRATE_CMA, we need to release the excess + * buddy pages to CMA itself. + */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - /* - * Borrow the excess buddy pages as well, irrespective - * of whether we stole freepages, or took ownership of - * the pageblock or not. - * - * Exception: When borrowing from MIGRATE_CMA, release - * the excess buddy pages to CMA itself. - */ expand(zone, page, order, current_order, area, - is_migrate_cma(migratetype) - ? migratetype : start_migratetype); + new_type); - trace_mm_page_alloc_extfrag(page, order, - current_order, start_migratetype, migratetype, - new_type == start_migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype, new_type); return page; } @@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * comments in mmzone.h. Reduces cache footprint of zonelist scans * that have to skip over a lot of full or unallowed zones. * - * If the zonelist cache is present in the passed in zonelist, then + * If the zonelist cache is present in the passed zonelist, then * returns a pointer to the allowed node mask (either the current * tasks mems_allowed, or node_states[N_MEMORY].) * @@ -1822,7 +1816,7 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) static bool zone_local(struct zone *local_zone, struct zone *zone) { - return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; + return local_zone->node == zone->node; } static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) @@ -1919,18 +1913,17 @@ zonelist_scan: * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. * - * When zone_reclaim_mode is enabled, try to stay in - * local zones in the fastpath. If that fails, the - * slowpath is entered, which will do another pass - * starting with the local zones, but ultimately fall - * back to remote zones that do not partake in the - * fairness round-robin cycle of this zonelist. + * Try to stay in local zones in the fastpath. 
If + * that fails, the slowpath is entered, which will do + * another pass starting with the local zones, but + * ultimately fall back to remote zones that do not + * partake in the fairness round-robin cycle of this + * zonelist. */ if (alloc_flags & ALLOC_WMARK_LOW) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; - if (zone_reclaim_mode && - !zone_local(preferred_zone, zone)) + if (!zone_local(preferred_zone, zone)) continue; } /* @@ -2396,7 +2389,7 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, * thrash fairness information for zones that are not * actually part of this zonelist's round-robin cycle. */ - if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) + if (!zone_local(preferred_zone, zone)) continue; mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - @@ -2593,7 +2586,7 @@ rebalance: * running out of options and have to consider going OOM */ if (!did_some_progress) { - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_gfp_allowed(gfp_mask)) { if (oom_killer_disabled) goto nopage; /* Coredumps can quickly deplete all memory reserves */ @@ -3881,8 +3874,6 @@ static inline unsigned long wait_table_bits(unsigned long size) return ffz(~size); } -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - /* * Check if a pageblock contains reserved pages */ @@ -4015,7 +4006,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nid_reset_last(page); + page_cpupid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -4266,7 +4257,7 @@ static __meminit void zone_pcp_init(struct zone *zone) */ zone->pageset = &boot_pageset; - if (zone->present_pages) + if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", zone->name, zone->present_pages, zone_batchsize(zone)); @@ -5160,7 +5151,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) { + if (populated_zone(zone)) { node_set_state(nid, N_HIGH_MEMORY); if (N_NORMAL_MEMORY != N_HIGH_MEMORY && zone_type <= ZONE_NORMAL) @@ -6366,10 +6357,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages -= 1 << order; -#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 5da2cbcfdbb5..2beeabf502c5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (err) break; pgd++; - } while (addr = next, addr != end); + } while (addr = next, addr < end); return err; } diff --git a/mm/percpu.c b/mm/percpu.c index 8c8e08f3a692..0d10defe951e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1706,8 +1706,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) - free_fn(areas[group], - ai->groups[group].nr_units * ai->unit_size); + if (areas[group]) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 
3929a40bd6c0..a8b919925934 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + struct mm_struct *mm = (vma)->vm_mm; pte_t pte; - pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } @@ -151,14 +152,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -170,14 +171,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } @@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + pmd_t entry = *pmdp; + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..7cdbb44aa90b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); + pgoff_t prev_offset; /* * start of file @@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss + * trivial case: (offset - prev_offset) == 1 + * unaligned reads: (offset - prev_offset) == 0 */ - if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + if (offset - prev_offset <= 1UL) goto initial_readahead; /* @@ -569,7 +573,7 @@ static ssize_t do_readahead(struct address_space *mapping, struct file *filp, pgoff_t index, unsigned long nr) { - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + if (!mapping || !mapping->a_ops) return -EINVAL; force_page_cache_readahead(mapping, filp, index, nr); diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a54a13..068522d8502a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -600,8 +600,12 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, spinlock_t *ptl; if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ pte = huge_pte_offset(mm, address); - ptl = &mm->page_table_lock; + if (!pte) + return NULL; + + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; } @@ -665,25 +669,23 @@ int page_referenced_one(struct page 
*page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +694,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter diff --git a/mm/shmem.c b/mm/shmem.c index 8297623fcaed..902a14842b74 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2918,13 +2918,8 @@ static struct dentry_operations anon_ops = { .d_dname = simple_dname }; -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps - * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size - */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +static struct file *__shmem_file_setup(const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) { struct file *res; struct inode *inode; @@ -2957,6 +2952,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (!inode) goto put_dentry; + inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ @@ -2977,6 +2973,32 @@ put_memory: shmem_unacct_size(flags, size); return res; } + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The one user is the big_key implementation. LSM checks + * are provided at the key level rather than the inode level. + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, 0); +} EXPORT_SYMBOL_GPL(shmem_file_setup); /** diff --git a/mm/slab.c b/mm/slab.c index 2580db062df9..eb043bf05f4c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -164,72 +164,6 @@ static bool pfmemalloc_active __read_mostly; /* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. 
- * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. - * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256. - */ - -typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) - -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU. This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking. We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - */ -struct slab_rcu { - struct rcu_head head; - struct kmem_cache *cachep; - void *addr; -}; - -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. 
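A freestanding sketch of the index-stack freelist that replaces the kmem_bufctl_t chains being removed here; it mirrors what cache_init_objs(), slab_get_obj() and slab_put_obj() do further down in this patch, but the toy_* names are illustrative only:

#include <assert.h>
#include <stdio.h>

#define NUM_OBJS 4

/* stand-in for the per-slab state the patch keeps in struct page */
struct toy_slab {
    unsigned int active;             /* objects handed out so far */
    unsigned int freelist[NUM_OBJS]; /* indices of free objects */
};

static void toy_init(struct toy_slab *s)
{
    s->active = 0;
    for (unsigned int i = 0; i < NUM_OBJS; i++)
        s->freelist[i] = i;          /* like cache_init_objs() */
}

static unsigned int toy_get(struct toy_slab *s)
{
    assert(s->active < NUM_OBJS);
    return s->freelist[s->active++]; /* like slab_get_obj() */
}

static void toy_put(struct toy_slab *s, unsigned int objnr)
{
    assert(s->active > 0);
    s->freelist[--s->active] = objnr; /* like slab_put_obj() */
}

int main(void)
{
    struct toy_slab s;
    unsigned int a, b;

    toy_init(&s);
    a = toy_get(&s);
    b = toy_get(&s);
    toy_put(&s, a);
    printf("active=%u, next index to hand out=%u (b=%u still in use)\n",
           s.active, s.freelist[s.active], b);
    return 0;
}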
- */ -struct slab { - union { - struct { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; - }; - struct slab_rcu __slab_cover_slab_rcu; - }; -}; - -/* * struct array_cache * * Purpose: @@ -456,18 +390,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } -static inline struct slab *virt_to_slab(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - - VM_BUG_ON(!PageSlab(page)); - return page->slab_page; -} - -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { - return slab->s_mem + cache->size * idx; + return page->s_mem + cache->size * idx; } /* @@ -477,9 +403,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct page *page, void *obj) { - u32 offset = (obj - slab->s_mem); + u32 offset = (obj - page->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } @@ -641,7 +567,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(nr_objs * sizeof(unsigned int), align); } /* @@ -660,8 +586,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - The struct slab - * - One kmem_bufctl_t for each object + * - One unsigned int for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * @@ -674,8 +599,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, mgmt_size = 0; nr_objs = slab_size / buffer_size; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; } else { /* * Ignore padding for the initial guess. The padding @@ -685,8 +608,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into the memory allocation when taking the padding * into account. 
*/ - nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); /* * This calculated number will be either the right @@ -696,9 +618,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, > slab_size) nr_objs--; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); } *num = nr_objs; @@ -829,10 +748,8 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } -static inline bool is_slab_pfmemalloc(struct slab *slabp) +static inline bool is_slab_pfmemalloc(struct page *page) { - struct page *page = virt_to_page(slabp->s_mem); - return PageSlabPfmemalloc(page); } @@ -841,23 +758,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { struct kmem_cache_node *n = cachep->node[numa_mem_id()]; - struct slab *slabp; + struct page *page; unsigned long flags; if (!pfmemalloc_active) return; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_full, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_partial, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_partial, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_free, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_free, lru) + if (is_slab_pfmemalloc(page)) goto out; pfmemalloc_active = false; @@ -897,8 +814,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, */ n = cachep->node[numa_mem_id()]; if (!list_empty(&n->slabs_free) && force_refill) { - struct slab *slabp = virt_to_slab(objp); - ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); + struct page *page = virt_to_head_page(objp); + ClearPageSlabPfmemalloc(page); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -1099,8 +1016,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - struct slab *slabp = virt_to_slab(objp); - int nodeid = slabp->nodeid; + int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1111,7 +1027,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. 
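The reworked cache_estimate() arithmetic above (one unsigned int of freelist per object, no struct slab header) can be re-derived in userspace; toy_estimate below is an illustration of the on-slab case with made-up sizes, not a kernel symbol:

#include <stdio.h>

/* a is assumed to be a power of two, as cachep->align is */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static void toy_estimate(unsigned long slab_size, unsigned long buffer_size,
                         unsigned long align,
                         unsigned long *num, unsigned long *left_over)
{
    /* initial guess: ignore the alignment padding of the freelist */
    unsigned long nr_objs = slab_size / (buffer_size + sizeof(unsigned int));

    /* the guess may be one too high once padding is accounted for */
    if (ALIGN_UP(nr_objs * sizeof(unsigned int), align)
        + nr_objs * buffer_size > slab_size)
        nr_objs--;

    *num = nr_objs;
    *left_over = slab_size - nr_objs * buffer_size
                 - ALIGN_UP(nr_objs * sizeof(unsigned int), align);
}

int main(void)
{
    unsigned long num, left;

    toy_estimate(4096, 256, 8, &num, &left);
    printf("objs per 4k slab: %lu, left over: %lu bytes\n", num, left);
    return 0;
}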
*/ - if (likely(slabp->nodeid == node)) + if (likely(nodeid == node)) return 0; n = cachep->node[node]; @@ -1512,6 +1428,8 @@ void __init kmem_cache_init(void) { int i; + BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < + sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; setup_node_pointer(kmem_cache); @@ -1687,7 +1605,7 @@ static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { struct kmem_cache_node *n; - struct slab *slabp; + struct page *page; unsigned long flags; int node; @@ -1706,15 +1624,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) { + list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) + list_for_each_entry(page, &n->slabs_free, lru) num_slabs++; free_objects += n->free_objects; @@ -1736,19 +1654,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. */ -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct page *page; int nr_pages; - int i; - -#ifndef CONFIG_MMU - /* - * Nommu uses slab's for process anonymous memory allocations, and thus - * requires __GFP_COMP to properly refcount higher order allocations - */ - flags |= __GFP_COMP; -#endif flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1772,12 +1682,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - for (i = 0; i < nr_pages; i++) { - __SetPageSlab(page + i); - - if (page->pfmemalloc) - SetPageSlabPfmemalloc(page + i); - } + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1789,17 +1696,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) kmemcheck_mark_unallocated_pages(page, nr_pages); } - return page_address(page); + return page; } /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); - const unsigned long nr_freed = i; + const unsigned long nr_freed = (1 << cachep->gfporder); kmemcheck_free_shadow(page, cachep->gfporder); @@ -1809,27 +1714,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); - while (i--) { - BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); - page++; - } + + BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page_mapcount_reset(page); + page->mapping = NULL; memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + __free_memcg_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - struct kmem_cache *cachep = slab_rcu->cachep; + struct kmem_cache *cachep; + struct page *page; - kmem_freepages(cachep, slab_rcu->addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slab_rcu); + page = container_of(head, struct page, rcu_head); + cachep = page->slab_cache; + + kmem_freepages(cachep, page); } #if DEBUG @@ -1978,19 +1884,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(objp); unsigned int objnr; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); if (objnr) { - objp = index_to_obj(cachep, slabp, objnr - 1); + objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, slabp, objnr + 1); + objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -2001,11 +1907,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -2030,7 +1937,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } } #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { } #endif @@ -2044,23 +1952,34 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. 
*/ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct page *page) { - void *addr = slabp->s_mem - slabp->colouroff; + void *freelist; - slab_destroy_debugcheck(cachep, slabp); + freelist = page->freelist; + slab_destroy_debugcheck(cachep, page); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct slab_rcu *slab_rcu; + struct rcu_head *head; + + /* + * RCU free overloads the RCU head over the LRU. + * slab_page has been overloaded over the LRU, + * however it is not used from now on so that + * we can use it safely. + */ + head = (void *)&page->rcu_head; + call_rcu(head, kmem_rcu_free); - slab_rcu = (struct slab_rcu *)slabp; - slab_rcu->cachep = cachep; - slab_rcu->addr = addr; - call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + kmem_freepages(cachep, page); } + + /* + * From now on, we don't use the freelist, + * although the actual page can be freed in an RCU context + */ + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->freelist_cache, freelist); } /** @@ -2097,8 +2016,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + offslab_limit = size; + offslab_limit /= sizeof(unsigned int); if (num > offslab_limit) break; @@ -2220,7 +2139,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, slab_size, ralign; + size_t left_over, freelist_size, ralign; gfp_t gfp; int err; size_t size = cachep->size; @@ -2339,22 +2258,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), cachep->align); + freelist_size = + ALIGN(cachep->num * sizeof(unsigned int), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ - if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { flags &= ~CFLGS_OFF_SLAB; - left_over -= slab_size; + left_over -= freelist_size; } if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + freelist_size = cachep->num * sizeof(unsigned int); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2371,16 +2289,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (cachep->colour_off < cachep->align) cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; - cachep->slab_size = slab_size; + cachep->freelist_size = freelist_size; cachep->flags = flags; - cachep->allocflags = 0; + cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->slabp_cache = kmalloc_slab(slab_size, 0u); + cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. 
* But since we go off slab only for object size greater than @@ -2388,7 +2306,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); + BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } err = setup_cpu_cache(cachep, gfp); @@ -2494,7 +2412,7 @@ static int drain_freelist(struct kmem_cache *cache, { struct list_head *p; int nr_freed; - struct slab *slabp; + struct page *page; nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -2506,18 +2424,18 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - slabp = list_entry(p, struct slab, list); + page = list_entry(p, struct page, lru); #if DEBUG - BUG_ON(slabp->inuse); + BUG_ON(page->active); #endif - list_del(&slabp->list); + list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked * to the cache. */ n->free_objects -= cache->num; spin_unlock_irq(&n->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, page); nr_freed++; } out: @@ -2600,52 +2518,42 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * descriptors in kmem_cache_create, we search through the malloc_sizes array. * If we are creating a malloc_sizes cache here it would not be visible to * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have slabp_cache same as the original cache. + * Hence we cannot have freelist_cache same as the original cache. */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags, - int nodeid) +static void *alloc_slabmgmt(struct kmem_cache *cachep, + struct page *page, int colour_off, + gfp_t local_flags, int nodeid) { - struct slab *slabp; + void *freelist; + void *addr = page_address(page); if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc_node(cachep->slabp_cache, + freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - /* - * If the first object in the slab is leaked (it's allocated - * but no one has a reference to it), we want to make sure - * kmemleak does not treat the ->s_mem pointer as a reference - * to the object. Otherwise we will not report the leak. - */ - kmemleak_scan_area(&slabp->list, sizeof(struct list_head), - local_flags); - if (!slabp) + if (!freelist) return NULL; } else { - slabp = objp + colour_off; - colour_off += cachep->slab_size; + freelist = addr + colour_off; + colour_off += cachep->freelist_size; } - slabp->inuse = 0; - slabp->colouroff = colour_off; - slabp->s_mem = objp + colour_off; - slabp->nodeid = nodeid; - slabp->free = 0; - return slabp; + page->active = 0; + page->s_mem = addr + colour_off; + return freelist; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +static inline unsigned int *slab_freelist(struct page *page) { - return (kmem_bufctl_t *) (slabp + 1); + return (unsigned int *)(page->freelist); } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp) + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2681,9 +2589,8 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(slabp)[i] = i + 1; + slab_freelist(page)[i] = i; } - slab_bufctl(slabp)[i - 1] = BUFCTL_END; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2696,41 +2603,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, int nodeid) { - void *objp = index_to_obj(cachep, slabp, slabp->free); - kmem_bufctl_t next; + void *objp; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + page->active++; #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif - slabp->free = next; return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, +static void slab_put_obj(struct kmem_cache *cachep, struct page *page, void *objp, int nodeid) { - unsigned int objnr = obj_to_index(cachep, slabp, objp); - + unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG + unsigned int i; + /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); + /* Verify double free bug */ + for (i = page->active; i < cachep->num; i++) { + if (slab_freelist(page)[i] == objnr) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } } #endif - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; - slabp->inuse--; + page->active--; + slab_freelist(page)[page->active] = objnr; } /* @@ -2738,23 +2645,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, * for the slab allocator to be able to lookup the cache and slab of a * virtual address for kfree, ksize, and slab debugging. */ -static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, - void *addr) +static void slab_map_pages(struct kmem_cache *cache, struct page *page, + void *freelist) { - int nr_pages; - struct page *page; - - page = virt_to_page(addr); - - nr_pages = 1; - if (likely(!PageCompound(page))) - nr_pages <<= cache->gfporder; - - do { - page->slab_cache = cache; - page->slab_page = slab; - page++; - } while (--nr_pages); + page->slab_cache = cache; + page->freelist = freelist; } /* @@ -2762,9 +2657,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * kmem_cache_alloc() when there are no active objs left in a cache. */ static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) + gfp_t flags, int nodeid, struct page *page) { - struct slab *slabp; + void *freelist; size_t offset; gfp_t local_flags; struct kmem_cache_node *n; @@ -2805,20 +2700,20 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); - if (!objp) + if (!page) + page = kmem_getpages(cachep, local_flags, nodeid); + if (!page) goto failed; /* Get slab management. 
*/ - slabp = alloc_slabmgmt(cachep, objp, offset, + freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!slabp) + if (!freelist) goto opps1; - slab_map_pages(cachep, slabp, objp); + slab_map_pages(cachep, page, freelist); - cache_init_objs(cachep, slabp); + cache_init_objs(cachep, page); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2826,13 +2721,13 @@ static int cache_grow(struct kmem_cache *cachep, spin_lock(&n->list_lock); /* Make slab active. */ - list_add_tail(&slabp->list, &(n->slabs_free)); + list_add_tail(&page->lru, &(n->slabs_free)); STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, page); failed: if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2880,9 +2775,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct page *page; unsigned int objnr; - struct slab *slabp; + struct page *page; BUG_ON(virt_to_cache(objp) != cachep); @@ -2890,8 +2784,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_head_page(objp); - slabp = page->slab_page; - if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); *dbg_redzone1(cachep, objp) = RED_INACTIVE; @@ -2900,14 +2792,11 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); + BUG_ON(objp != index_to_obj(cachep, page, objnr)); -#ifdef CONFIG_DEBUG_SLAB_LEAK - slab_bufctl(slabp)[objnr] = BUFCTL_FREE; -#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2924,33 +2813,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, return objp; } -static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) -{ - kmem_bufctl_t i; - int entries = 0; - - /* Check slab's freelist to see if this obj is there. */ - for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { - entries++; - if (entries > cachep->num || i >= cachep->num) - goto bad; - } - if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in " - "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse, - print_tainted()); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, - sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), - 1); - BUG(); - } -} #else #define kfree_debugcheck(x) do { } while(0) #define cache_free_debugcheck(x,objp,z) (objp) -#define check_slabp(x,y) do { } while(0) #endif static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, @@ -2989,7 +2854,7 @@ retry: while (batchcount > 0) { struct list_head *entry; - struct slab *slabp; + struct page *page; /* Get slab alloc is to come from. 
*/ entry = n->slabs_partial.next; if (entry == &n->slabs_partial) { @@ -2999,8 +2864,7 @@ retry: goto must_grow; } - slabp = list_entry(entry, struct slab, list); - check_slabp(cachep, slabp); + page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -3008,24 +2872,23 @@ retry: * there must be at least one object available for * allocation. */ - BUG_ON(slabp->inuse >= cachep->num); + BUG_ON(page->active >= cachep->num); - while (slabp->inuse < cachep->num && batchcount--) { + while (page->active < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, + ac_put_obj(cachep, ac, slab_get_obj(cachep, page, node)); } - check_slabp(cachep, slabp); /* move slabp to correct slabp list: */ - list_del(&slabp->list); - if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &n->slabs_full); + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -3097,16 +2960,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } -#ifdef CONFIG_DEBUG_SLAB_LEAK - { - struct slab *slabp; - unsigned objnr; - - slabp = virt_to_head_page(objp)->slab_page; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; - slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; - } -#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3248,18 +3101,20 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
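A toy model of the list bookkeeping that cache_alloc_refill() above and free_block() below now key off page->active: a slab with every object in use goes to slabs_full, an untouched one to slabs_free, anything else to slabs_partial. The toy_* names and the enum are invented:

#include <stdio.h>

#define NUM 3   /* objects per slab in this toy */

enum slab_list { SLABS_FREE, SLABS_PARTIAL, SLABS_FULL };

struct toy_slab {
    unsigned int active;
    enum slab_list on;
};

/* mirrors the list moves in cache_alloc_refill() and free_block() */
static void toy_fixup(struct toy_slab *s)
{
    if (s->active == 0)
        s->on = SLABS_FREE;
    else if (s->active == NUM)
        s->on = SLABS_FULL;
    else
        s->on = SLABS_PARTIAL;
}

static void toy_alloc(struct toy_slab *s) { s->active++; toy_fixup(s); }
static void toy_free(struct toy_slab *s)  { s->active--; toy_fixup(s); }

int main(void)
{
    struct toy_slab s = { 0, SLABS_FREE };

    toy_alloc(&s);
    toy_alloc(&s);
    toy_alloc(&s);
    printf("after 3 allocs: list=%d (2=full)\n", s.on);
    toy_free(&s);
    printf("after 1 free:   list=%d (1=partial)\n", s.on);
    return 0;
}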
*/ + struct page *page; + if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_mem_id()); + page = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); - if (obj) { + if (page) { /* * Insert into the appropriate per node queues */ - nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + nid = page_to_nid(page); + if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (!obj) @@ -3288,7 +3143,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; void *obj; int x; @@ -3308,26 +3163,24 @@ retry: goto must_grow; } - slabp = list_entry(entry, struct slab, list); + page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slabp->inuse == cachep->num); + BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, slabp, nodeid); - check_slabp(cachep, slabp); + obj = slab_get_obj(cachep, page, nodeid); n->free_objects--; /* move slabp to correct slabp list: */ - list_del(&slabp->list); + list_del(&page->lru); - if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &n->slabs_full); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); spin_unlock(&n->list_lock); goto done; @@ -3477,23 +3330,21 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp; - struct slab *slabp; + struct page *page; clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; - slabp = virt_to_slab(objp); + page = virt_to_head_page(objp); n = cachep->node[node]; - list_del(&slabp->list); + list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - check_slabp(cachep, slabp); - slab_put_obj(cachep, slabp, objp, node); + slab_put_obj(cachep, page, objp, node); STATS_DEC_ACTIVE(cachep); n->free_objects++; - check_slabp(cachep, slabp); /* fixup slab chains */ - if (slabp->inuse == 0) { + if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; /* No need to drop any previously held @@ -3502,16 +3353,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, page); } else { - list_add(&slabp->list, &n->slabs_free); + list_add(&page->lru, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
*/ - list_add_tail(&slabp->list, &n->slabs_partial); + list_add_tail(&page->lru, &n->slabs_partial); } } } @@ -3551,10 +3402,10 @@ free_done: p = n->slabs_free.next; while (p != &(n->slabs_free)) { - struct slab *slabp; + struct page *page; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); + page = list_entry(p, struct page, lru); + BUG_ON(page->active); i++; p = p->next; @@ -3982,7 +3833,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, VM_BUG_ON(!mutex_is_locked(&slab_mutex)); for_each_memcg_cache_index(i) { - c = cache_from_memcg(cachep, i); + c = cache_from_memcg_idx(cachep, i); if (c) /* return value determined by the parent cache only */ __do_tune_cpucache(c, limit, batchcount, shared, gfp); @@ -4158,7 +4009,7 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct slab *slabp; + struct page *page; unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs = 0; @@ -4178,23 +4029,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) { - if (slabp->inuse != cachep->num && !error) + list_for_each_entry(page, &n->slabs_full, lru) { + if (page->active != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - if (slabp->inuse == cachep->num && !error) - error = "slabs_partial inuse accounting error"; - if (!slabp->inuse && !error) - error = "slabs_partial/inuse accounting error"; - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + if (page->active == cachep->num && !error) + error = "slabs_partial accounting error"; + if (!page->active && !error) + error = "slabs_partial accounting error"; + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) { - if (slabp->inuse && !error) - error = "slabs_free/inuse accounting error"; + list_for_each_entry(page, &n->slabs_free, lru) { + if (page->active && !error) + error = "slabs_free accounting error"; num_slabs++; } free_objects += n->free_objects; @@ -4346,15 +4197,27 @@ static inline int add_caller(unsigned long *n, unsigned long v) return 1; } -static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +static void handle_slab(unsigned long *n, struct kmem_cache *c, + struct page *page) { void *p; - int i; + int i, j; + if (n[0] == n[1]) return; - for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { - if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { + bool active = true; + + for (j = page->active; j < c->num; j++) { + /* Skip freed item */ + if (slab_freelist(page)[j] == i) { + active = false; + break; + } + } + if (!active) continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) return; } @@ -4379,7 +4242,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; const char *name; unsigned long *x = m->private; @@ -4403,10 +4266,10 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) - handle_slab(x, cachep, slabp); - list_for_each_entry(slabp, 
&n->slabs_partial, list) - handle_slab(x, cachep, slabp); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); spin_unlock_irq(&n->list_lock); } name = cachep->name; diff --git a/mm/slab.h b/mm/slab.h index a535033f7e9a..0859c4241ba1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) { if (!s->memcg_params) return NULL; @@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) { return NULL; } diff --git a/mm/slab_common.c b/mm/slab_common.c index a3443278ce3a..0b7bb399b0e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, continue; } +#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) /* * For simplicity, we won't check this in the list of memcg * caches. We have control over memcg naming, and if there @@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, s = NULL; return -EINVAL; } +#endif } WARN_ON(strchr(name, ' ')); /* It confuses parsers */ @@ -569,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) return; for_each_memcg_cache_index(i) { - c = cache_from_memcg(s, i); + c = cache_from_memcg_idx(s, i); if (!c) continue; diff --git a/mm/slub.c b/mm/slub.c index c3eb3d3ca835..545a170ebf9f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -933,6 +933,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -955,7 +965,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kmemleak_free_recursive(x, s->flags); /* - * Trouble is that we may no longer disable interupts in the fast path + * Trouble is that we may no longer disable interrupts in the fast path * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ @@ -1217,8 +1227,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size, /* * Enable debugging if selected on the kernel commandline. 
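The hunk that follows tightens this check: kmem_cache_flags() may be called for a cache created with a NULL name, and strncmp() must not see it. A minimal userspace rendering of the fixed predicate (toy_match is illustrative, not a kernel symbol):

#include <stdio.h>
#include <string.h>

static int toy_match(const char *debug_slabs, const char *name)
{
    /* no filter string on the command line: debug every cache */
    if (!debug_slabs)
        return 1;
    /* guard the NULL name before it reaches strncmp() */
    return name && !strncmp(debug_slabs, name, strlen(debug_slabs));
}

int main(void)
{
    printf("%d %d %d\n",
           toy_match("kmalloc", "kmalloc-64"), /* 1: prefix matches */
           toy_match("kmalloc", NULL),         /* 0: no crash */
           toy_match(NULL, NULL));             /* 1: no filter set */
    return 0;
}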
*/ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; return flags; @@ -1260,13 +1270,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, + flags & gfp_allowed_mask); +} -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} #endif /* CONFIG_SLUB_DEBUG */ @@ -2829,8 +2856,8 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) @@ -3272,7 +3299,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -3336,7 +3363,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); + kfree_hook(x); __free_memcg_kmem_pages(page, compound_order(page)); return; } @@ -4983,7 +5010,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, * through the descendants with best-effort propagation. */ for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg(s, i); + struct kmem_cache *c = cache_from_memcg_idx(s, i); if (c) attribute->store(c, buf, len); } diff --git a/mm/sparse.c b/mm/sparse.c index 4ac1d7ef548f..8cc7be0e9590 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -590,33 +590,32 @@ void __init sparse_init(void) #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { /* This will make the necessary allocations eventually. 
*/ return sparse_mem_map_populate(pnum, nid); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + nr_pages); + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); vmemmap_free(start, end); } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + nr_pages); + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); vmemmap_free(start, end); } #endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +static struct page *__kmalloc_section_memmap(void) { struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * nr_pages; + unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) @@ -634,28 +633,30 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { - return __kmalloc_section_memmap(nr_pages); + return __kmalloc_section_memmap(); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * nr_pages)); + get_order(sizeof(struct page) * PAGES_PER_SECTION)); } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic; + unsigned long magic, nr_pages; struct page *page = virt_to_page(memmap); + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. 
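With the hunks above, every memmap is exactly one section long, so nr_pages no longer needs to be passed around. A back-of-the-envelope check of the sizes involved, assuming x86_64-style constants (all values here are illustrative):

#include <stdio.h>

#define PAGE_SHIFT          12
#define PAGE_SIZE           (1UL << PAGE_SHIFT)
#define SECTION_SIZE_BITS   27  /* 128 MiB sections */
#define PAGES_PER_SECTION   (1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))
#define STRUCT_PAGE_SIZE    64UL /* a typical sizeof(struct page) */
#define PAGE_ALIGN(x)       (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long memmap_size = STRUCT_PAGE_SIZE * PAGES_PER_SECTION;
    /* the nr_pages now computed inside free_map_bootmem() itself */
    unsigned long nr_pages = PAGE_ALIGN(memmap_size) >> PAGE_SHIFT;

    printf("memmap per section: %lu bytes (%lu backing pages)\n",
           memmap_size, nr_pages);
    return 0;
}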
*/ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages) +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; @@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); if (!memmap) return -ENOMEM; usemap = __kmalloc_section_usemap(); if (!usemap) { - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); return -ENOMEM; } @@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } - memset(memmap, 0, sizeof(struct page) * nr_pages); + memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); ms->section_mem_map |= SECTION_MARKED_PRESENT; @@ -729,7 +729,7 @@ out: pgdat_resize_unlock(pgdat, &flags); if (ret <= 0) { kfree(usemap); - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); } return ret; } @@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; - unsigned long nr_pages; if (!usemap) return; @@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) if (PageSlab(usemap_page) || PageCompound(usemap_page)) { kfree(usemap); if (memmap) - __kfree_section_memmap(memmap, PAGES_PER_SECTION); + __kfree_section_memmap(memmap); return; } @@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * on the section which has pgdat at boot time. Just keep it as is now. */ - if (memmap) { - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - free_map_bootmem(memmap, nr_pages); - } + if (memmap) + free_map_bootmem(memmap); } void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) diff --git a/mm/swap.c b/mm/swap.c index 759c3caf44bd..84b26aaabd03 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - /* - * hugetlbfs pages cannot be split from under us. If this is a - * hugetlbfs page, check refcount on head page and release the page if - * the refcount becomes zero. - */ - if (PageHuge(page)) { - page = compound_head(page); - if (put_page_testzero(page)) - __put_compound_page(page); - - return; - } - if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -111,14 +98,31 @@ static void put_compound_page(struct page *page) * still hot on arches that do not support * this_cpu_cmpxchg_double(). */ - if (PageSlab(page_head)) { - if (PageTail(page)) { + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount + * cannot race here. 
+ */ + VM_BUG_ON(!PageHead(page_head)); + atomic_dec(&page->_mapcount); if (put_page_testzero(page_head)) VM_BUG_ON(1); - - atomic_dec(&page->_mapcount); - goto skip_lock_tail; + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + return; } else + /* + * __split_huge_page_refcount + * run before us, "page" was a + * THP tail. The split + * page_head has been freed + * and reallocated as slab or + * hugetlbfs page of smaller + * order (only possible if + * reallocated as slab on + * x86). + */ goto skip_lock; } /* @@ -132,8 +136,27 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); skip_lock: - if (put_page_testzero(page_head)) - __put_single_page(page_head); + if (put_page_testzero(page_head)) { + /* + * The head page may have been + * freed and reallocated as a + * compound page of smaller + * order and then freed again. + * All we know is that it + * cannot have become: a THP + * page, a compound page of + * higher order, a tail page. + * That is because we still + * hold the refcount of the + * split THP tail and + * page_head was the THP head + * before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } out_put_single: if (put_page_testzero(page)) __put_single_page(page); @@ -155,7 +178,6 @@ out_put_single: VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); -skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ + unsigned long flags; bool got = false; - struct page *page_head; - - /* - * If this is a hugetlbfs page it cannot be split under us. Simply - * increment refcount for the head page. - */ - if (PageHuge(page)) { - page_head = compound_head(page); - atomic_inc(&page_head->_count); - got = true; - } else { - unsigned long flags; + struct page *page_head = compound_trans_head(page); - page_head = compound_trans_head(page); - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; - } - } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). 
+ */ put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } return got; } @@ -934,7 +957,8 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - bdi_init(swapper_spaces[0].backing_dev_info); + if (bdi_init(swapper_spaces[0].backing_dev_info)) + panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) { spin_lock_init(&swapper_spaces[i].tree_lock); INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..612a7c9795f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -707,7 +707,7 @@ noswap: return (swp_entry_t) {0}; } -/* The only caller of this function is now susupend routine */ +/* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } /* - * Caller has made sure that the swapdevice corresponding to entry + * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) @@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibration suspends storage while it is writing the image + * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) @@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * some architectures (e.g. x86_32 with PAE) we might catch a glimpse * of unmatched parts which look like swp_pte, so unuse_pte must * recheck under pte lock. Scanning without pte lock lets it be - * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 
*/ pte = pte_offset_map(pmd, addr); do { @@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct filename *pathname; int i, type, prev; int err; + unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } swap_file = p->swap_file; + old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -1922,23 +1924,23 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->cluster_info = NULL; p->flags = 0; frontswap_map = frontswap_map_get(p); - frontswap_map_set(p, NULL); spin_unlock(&p->lock); spin_unlock(&swap_lock); frontswap_invalidate_area(type); + frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); - /* Destroy swap account informatin */ + /* Destroy swap account information */ swap_cgroup_swapoff(type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - set_blocksize(bdev, p->old_block_size); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); @@ -2784,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) /* * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel pagetables: so it - * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + * no architecture is using highmem pages for kernel page tables: so it + * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; diff --git a/mm/util.c b/mm/util.c index eaf63fc2c92f..f7bc2096071c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,6 +7,9 @@ #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mman.h> +#include <linux/hugetlb.h> + #include <asm/uaccess.h> #include "internal.h" @@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page) return mapping; } +/* + * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used + */ +unsigned long vm_commit_limit(void) +{ + return ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; +} + + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 107454312d5e..0fdf96803c5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!va)) return ERR_PTR(-ENOMEM); + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + retry: spin_lock(&vmap_area_lock); /* @@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller) + pgprot_t prot, int node) { const int order = 0; struct page **pages; @@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, - PAGE_KERNEL, node, caller); + PAGE_KERNEL, node, area->caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; - area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; - if (node < 0) + if (node == NUMA_NO_NODE) page = alloc_page(tmp_mask); else page = alloc_pages_node(node, tmp_mask, order); @@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + addr = __vmalloc_area_node(area, gfp_mask, prot, node); if (!addr) - goto fail; + return NULL; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, clear_vm_uninitialized_flag(area); /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. + * A ref_count = 2 is needed because vm_struct allocated in + * __get_vm_area_node() contains a reference to the virtual address of + * the vmalloc'ed block. */ - kmemleak_alloc(addr, real_size, 3, gfp_mask); + kmemleak_alloc(addr, real_size, 2, gfp_mask); return addr; @@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return; + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p) struct vmap_area *va = p; struct vm_struct *v; - if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + /* + * s_show can encounter race with remove_vm_area, !VM_VM_AREA on + * behalf of vmap area is being tear down or vm_map_ram allocation. 
+ */ + if (!(va->flags & VM_VM_AREA)) return 0; - if (!(va->flags & VM_VM_AREA)) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); - return 0; - } - v = va->vm; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); - if (v->flags & VM_UNINITIALIZED) - return 0; - seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ed1b775bdc9..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,7 @@ #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> #include "internal.h" @@ -139,23 +140,11 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - struct mem_cgroup *root = sc->target_mem_cgroup; - return !mem_cgroup_disabled() && - mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return false; -} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -222,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); @@ -1125,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -2176,11 +2167,9 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static int -__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; - int groups_scanned = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2188,17 +2177,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg = NULL; - mem_cgroup_iter_filter filter = (soft_reclaim) ? 
- mem_cgroup_soft_reclaim_eligible : NULL; + struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { struct lruvec *lruvec; - groups_scanned++; lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2218,7 +2205,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, @@ -2226,37 +2214,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); - - return groups_scanned; -} - - -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ - bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); - unsigned long nr_scanned = sc->nr_scanned; - int scanned_groups; - - scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); - /* - * memcg iterator might race with other reclaimer or start from - * a incomplete tree walk so the tree walk in __shrink_zone - * might have missed groups that are above the soft limit. Try - * another loop to catch up with others. Do it just once to - * prevent from reclaim latencies when other reclaimers always - * preempt this one. - */ - if (do_soft_reclaim && !scanned_groups) - __shrink_zone(zone, sc, do_soft_reclaim); - - /* - * No group is over the soft limit or those that are do not have - * pages in the zone we are reclaiming so we have to reclaim everybody - */ - if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { - __shrink_zone(zone, sc, false); - return; - } } /* Returns true if compaction should go ahead for a high-order request */ @@ -2320,6 +2277,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2359,6 +2318,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2952,6 +2923,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -3066,6 +3039,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; + nr_soft_scanned = 0; + /* + * Call soft limit reclaim before calling shrink_zone. 
+ */ + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* * There should be no need to raise the scanning * priority if enough pages are already being scanned diff --git a/mm/vmstat.c b/mm/vmstat.c index 9bb314577911..72496140ac08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -812,6 +812,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", + "numa_huge_pte_updates", "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", @@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu) schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void vmstat_cpu_dead(int node) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (cpu_to_node(cpu) == node) + goto end; + + node_clear_state(node, N_CPU); +end: + put_online_cpus(); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. @@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: refresh_zone_stat_thresholds(); + vmstat_cpu_dead(cpu_to_node(cpu)); break; default: break; @@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void) register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) + get_online_cpus(); + for_each_online_cpu(cpu) { start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); + } + put_online_cpus(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..5a63f78a5601 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) if (!entry) return NULL; entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -/* caller must hold the tree lock */ -static int zswap_entry_put(struct zswap_entry *entry) -{ - entry->refcount--; - return entry->refcount; -} - /********************************* * rbtree functions **********************************/ @@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * freeing the entry itself, and decrementing the number of stored pages. 
+ */ +static void zswap_free_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody reference the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(tree, entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry = NULL; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + /********************************* * per-cpu code **********************************/ @@ -368,18 +411,6 @@ static bool zswap_is_full(void) zswap_pool_pages); } -/* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, - * freeing the entry itself, and decrementing the number of stored pages. - */ -static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) -{ - zbud_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); -} - /********************************* * writeback code **********************************/ @@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) enum zswap_get_swap_ret { ZSWAP_SWAPCACHE_NEW, ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_NOMEM + ZSWAP_SWAPCACHE_FAIL, }; /* @@ -401,9 +432,10 @@ enum zswap_get_swap_ret { * added to the swap cache, and returned in retpage. 
* * If success, the swap cache page is returned in retpage - * Returns 0 if page was already in the swap cache, page is not locked - * Returns 1 if the new page needs to be populated, page is locked - * Returns <0 on error + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error */ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) @@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, if (new_page) page_cache_release(new_page); if (!found_page) - return ZSWAP_SWAPCACHE_NOMEM; + return ZSWAP_SWAPCACHE_FAIL; *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } @@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) struct page *page; u8 *src, *dst; unsigned int dlen; - int ret, refcount; + int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) /* find and ref zswap entry */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); return 0; } - zswap_entry_get(entry); spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { - case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ ret = -ENOMEM; goto fail; - case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ page_cache_release(page); ret = -EEXIST; @@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) SetPageUptodate(page); } + /* move it to the tail of the inactive list after end_writeback */ + SetPageReclaim(page); + /* start writeback */ __swap_writepage(page, &wbc, end_swap_bio_write); page_cache_release(page); zswap_written_back_pages++; spin_lock(&tree->lock); - /* drop local reference */ - zswap_entry_put(entry); - /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); /* - * There are three possible values for refcount here: - * (1) refcount is 1, load is in progress, unlink from rbtree, - * load will free - * (2) refcount is 0, (normal case) entry is valid, - * remove from rbtree and free entry - * (3) refcount is -1, invalidate happened during writeback, - * free entry - */ - if (refcount >= 0) { - /* no invalidate yet, remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); - } + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - if (refcount <= 0) { - /* free the entry */ - zswap_free_entry(tree, entry); - return 0; - } - return -EAGAIN; + goto end; + + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may happening concurrently + * it is safe and okay to not free the entry + * if we free the entry in the 
following put + * it is also okay to return !0 + */ fail: spin_lock(&tree->lock); - zswap_entry_put(entry); + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); + +end: return ret; } @@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, if (ret == -EEXIST) { zswap_duplicate_entry++; /* remove from rbtree */ - rb_erase(&dupentry->rbnode, &tree->rbroot); - if (!zswap_entry_put(dupentry)) { - /* free */ - zswap_free_entry(tree, dupentry); - } + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); spin_unlock(&tree->lock); @@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; u8 *src, *dst; unsigned int dlen; - int refcount, ret; + int ret; /* find */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return -1; } - zswap_entry_get(entry); spin_unlock(&tree->lock); /* decompress */ @@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, BUG_ON(ret); spin_lock(&tree->lock); - refcount = zswap_entry_put(entry); - if (likely(refcount)) { - spin_unlock(&tree->lock); - return 0; - } + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - /* - * We don't have to unlink from the rbtree because - * zswap_writeback_entry() or zswap_frontswap_invalidate page() - * has already done this for us if we are the last reference. - */ - /* free */ - - zswap_free_entry(tree, entry); - return 0; } @@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - int refcount; /* find */ spin_lock(&tree->lock); @@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) } /* remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); + zswap_rb_erase(&tree->rbroot, entry); /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - - if (refcount) { - /* writeback in progress, writeback will free */ - return; - } - - /* free */ - zswap_free_entry(tree, entry); } /* frees all zswap entries for the given swap type */ @@ -797,13 +803,14 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { - zbud_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - } + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(tree, entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); + + zbud_destroy_pool(tree->pool); + kfree(tree); + zswap_trees[type] = NULL; } static struct zbud_ops zswap_zbud_ops = {
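
A few illustrative sketches follow; they expand on patterns in the hunks above and are not part of the commit.

The mm/sparse.c hunks drop the nr_pages argument from sparse_add_one_section() because a sparse section has a fixed span: every section covers exactly PAGES_PER_SECTION pages, so the memset and the free paths can derive the size themselves. A minimal caller-side sketch of what the fixed size implies; sketch_add_section() is a hypothetical wrapper, not from the patch:

	/* only whole, section-aligned spans can be hot-added */
	static int __meminit sketch_add_section(struct zone *zone,
						unsigned long start_pfn,
						unsigned long nr_pages)
	{
		if (nr_pages != PAGES_PER_SECTION ||
		    start_pfn & (PAGES_PER_SECTION - 1))
			return -EINVAL;

		return sparse_add_one_section(zone, start_pfn);
	}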
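
The mm/swap.c rework hinges on one invariant: slab and hugetlbfs head pages are never split, while a THP head can be split at any point until the compound lock is held. A condensed sketch of the resulting fast/slow split in __get_page_tail(), assuming the 3.12-era helpers visible in the hunks (compound_trans_head(), __get_page_tail_foll(), compound_lock_irqsave()); sketch_pin_tail() is a hypothetical name:

	static bool sketch_pin_tail(struct page *page)
	{
		struct page *head = compound_trans_head(page);
		unsigned long flags;
		bool got = false;

		if (page == head || !get_page_unless_zero(head))
			return false;	/* not a tail, or head already freed */

		if (PageSlab(head) || PageHeadHuge(head)) {
			/* such heads never split, so PageTail is stable */
			if (likely(PageTail(page))) {
				__get_page_tail_foll(page, false);
				return true;
			}
			put_page(head);	/* head was freed and reused meanwhile */
			return false;
		}

		flags = compound_lock_irqsave(head);
		if (PageTail(page)) {	/* a split cannot run under the lock */
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(head, flags);
		if (!got)
			put_page(head);	/* lost the race with a THP split */
		return got;
	}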
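
In the mm/swapfile.c swapoff() hunks, old_block_size is copied out of the swap_info_struct while it still belongs to this swapoff: once p->flags is cleared and the locks are dropped, the slot may be reused by a concurrent swapon(), so the later set_blocksize() must not reach back into *p. The pattern in isolation, as a sketch that compresses the real sequence (which also holds swap_lock and tears down much more state):

	static unsigned int sketch_release_swap_slot(struct swap_info_struct *p,
						     struct block_device *bdev)
	{
		unsigned int old_block_size;

		spin_lock(&p->lock);
		old_block_size = p->old_block_size; /* snapshot while still ours */
		p->swap_file = NULL;
		p->flags = 0;		/* from here on, swapon() may reuse *p */
		spin_unlock(&p->lock);

		set_blocksize(bdev, old_block_size); /* safe: uses the snapshot */
		return old_block_size;
	}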
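
The new vm_commit_limit() in mm/util.c centralizes the OVERCOMMIT_NEVER limit. A worked example with illustrative numbers: on a machine with 8 GiB of RAM (2,097,152 pages of 4 KiB), 512 MiB of hugetlb pages (131,072 pages), sysctl_overcommit_ratio = 50 and 2 GiB of swap (524,288 pages), the limit is (2,097,152 - 131,072) * 50 / 100 + 524,288 = 1,507,328 pages, about 5.75 GiB of commit charge. Hugetlb pages are subtracted because they cannot service ordinary page-sized allocations.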
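
The one-line change in unregister_shrinker() in mm/vmscan.c closes a leak: with NUMA-aware shrinkers, register_shrinker() allocates the per-node shrinker->nr_deferred array, and the unregister path is the natural place to free it. A hypothetical lifecycle with stub callbacks (my_count()/my_scan() are stand-ins):

	static unsigned long my_count(struct shrinker *s,
				      struct shrink_control *sc)
	{
		return 0;		/* stub: nothing cached */
	}

	static unsigned long my_scan(struct shrinker *s,
				     struct shrink_control *sc)
	{
		return SHRINK_STOP;	/* stub: never frees anything */
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE,	/* per-node nr_deferred[] */
	};

	static int __init sketch_init(void)
	{
		return register_shrinker(&my_shrinker); /* kmallocs nr_deferred */
	}

	static void __exit sketch_exit(void)
	{
		unregister_shrinker(&my_shrinker); /* now also kfrees nr_deferred */
	}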
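
With the soft-reclaim filter gone, shrink_zone() in mm/vmscan.c returns to the plain mem_cgroup_iter() walk. The canonical shape of that loop is sketched below for reference; the nr_to_reclaim early-exit condition is paraphrased from the usual limit-reclaim logic, since the hunk shows only the break itself:

	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

		shrink_lruvec(lruvec, sc);

		/* limit reclaim may stop once it has met its target ... */
		if (!global_reclaim(sc) &&
		    sc->nr_reclaimed >= sc->nr_to_reclaim) {
			/* ... but must reconcile the cached iterator first */
			mem_cgroup_iter_break(root, memcg);
			break;
		}
		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);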
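
The mm/zswap.c refactor funnels every free through zswap_entry_put(): dropping the last reference both erases the entry from the tree (zswap_rb_erase() tolerates an entry that is already off the tree, which is why entries start out with RB_CLEAR_NODE()) and releases the zbud handle. The resulting caller pattern, as zswap_frontswap_load() now uses it:

	/* lookup-with-reference under the tree lock */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	spin_unlock(&tree->lock);
	if (!entry)
		return -1;	/* already written back or invalidated */

	/* entry->handle stays valid here without the tree lock held */

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);	/* frees it if we held the last ref */
	spin_unlock(&tree->lock);
	return 0;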