diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/huge_memory.c | 72 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 13 | ||||
-rw-r--r-- | mm/memblock.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 98 | ||||
-rw-r--r-- | mm/memory-failure.c | 94 | ||||
-rw-r--r-- | mm/memory.c | 34 | ||||
-rw-r--r-- | mm/mempolicy.c | 16 | ||||
-rw-r--r-- | mm/migrate.c | 15 | ||||
-rw-r--r-- | mm/mlock.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 23 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 1 | ||||
-rw-r--r-- | mm/slab.c | 47 | ||||
-rw-r--r-- | mm/slob.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 132 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 39 |
20 files changed, 382 insertions, 233 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3ad483bdf505..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e187454d82f6..dbe99a5f2073 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page) /* after clearing PageTail the gup refcount can be released */ smp_mb(); - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | @@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1768,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); @@ -1806,6 +1813,8 @@ static void collapse_huge_page(struct mm_struct *mm, /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ if (!vma->anon_vma || vma->vm_ops || vma->vm_file) goto out; + if (is_vma_temporary_stack(vma)) + goto out; VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); pgd = pgd_offset(mm, address); @@ -1847,7 +1856,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); - mem_cgroup_uncharge_page(new_page); goto out; } @@ -1893,6 +1901,7 @@ out_up_write: return; out: + mem_cgroup_uncharge_page(new_page); #ifdef CONFIG_NUMA put_page(new_page); #endif @@ -1912,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1942,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1958,7 +1975,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } @@ -2027,32 +2044,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { - khugepaged_scan.address = vma->vm_end; - progress++; - continue; - } + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) { - progress++; - continue; - } + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; - if (khugepaged_scan.address > hend) { - khugepaged_scan.address = hend + HPAGE_PMD_SIZE; - progress++; - continue; - } - BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { int ret; @@ -2081,7 +2093,7 @@ breakouterloop: breakouterloop_mmap_sem: spin_lock(&khugepaged_mm_lock); - BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2236,9 +2248,9 @@ static int khugepaged(void *none) for (;;) { mutex_unlock(&khugepaged_mutex); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..84225f3b7190 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } diff --git a/mm/memblock.c b/mm/memblock.c index bdba245d8afd..4618fda975a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db76ef726293..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + nr_pages = -nr_pages; /* for event */ + } __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); @@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* @@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) struct page_cgroup *tail_pc = lookup_page_cgroup(tail); unsigned long flags; + if (mem_cgroup_disabled()) + return; /* * We have no races with charge/uncharge but will have races with * page state accounting. @@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, { int ret = -EINVAL; unsigned long flags; - + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) return -EBUSY; @@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int charge = PAGE_SIZE; + int page_size = PAGE_SIZE; unsigned long flags; int ret; @@ -2278,23 +2316,26 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto out; if (isolate_lru_page(page)) goto put; - /* The page is isolated from LRU and we have no race with splitting */ - charge = PAGE_SIZE << compound_order(page); + + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, charge); + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, charge); -put_back: - if (charge > PAGE_SIZE) + mem_cgroup_cancel_charge(parent, page_size); + + if (page_size > PAGE_SIZE) compound_unlock_irqrestore(page, flags); +put_back: putback_lru_page(page); put: put_page(page); @@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; @@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 548fbd70f026..0207c2f6f8bd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. */ if (access) { int nr; @@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (!PageHuge(page) && unlikely(split_huge_page(page))) - return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. + */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. + */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); + + if (hpage != ppage) + lock_page_nosync(ppage); - ret = try_to_unmap(hpage, ttu); + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* @@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); @@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/memory.c b/mm/memory.c index 31250faff390..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2367,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2432,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); @@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2666,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); @@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..b53ec99f1428 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy @@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } diff --git a/mm/migrate.c b/mm/migrate.c index 46fe8cc13d67..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -772,6 +772,7 @@ uncharge: unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -785,8 +786,6 @@ unlock: putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. @@ -888,7 +887,7 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. */ @@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; @@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 13e81ee8be9d..c3924c7f00be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549fd..cdef1d4b4e47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } @@ -2034,6 +2036,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; @@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0369f5b3ba1b..eb663fb533e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -6,6 +6,7 @@ * Copyright (C) 2010 Linus Torvalds */ +#include <linux/pagemap.h> #include <asm/tlb.h> #include <asm-generic/pgtable.h> diff --git a/mm/slab.c b/mm/slab.c index 4c6e2e31ced0..a18ba57517af 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t; #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - -/* * struct slab_rcu * * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to @@ -219,8 +203,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { struct rcu_head head; @@ -229,6 +211,27 @@ struct slab_rcu { }; /* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + +/* * struct array_cache * * Purpose: @@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. - * Note that kmem_cache_name() is not guaranteed to return the same pointer, - * therefore applications must manage it themselves. * * The flags are * @@ -3840,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *cachep) -{ - return cachep->name; -} -EXPORT_SYMBOL_GPL(kmem_cache_name); - /* * This initializes kmem_list3 or resizes various caches for all nodes. */ diff --git a/mm/slob.c b/mm/slob.c index 3588eaaef726..46e0aee33a23 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -666,12 +666,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *c) -{ - return c->name; -} -EXPORT_SYMBOL(kmem_cache_name); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..e841d8921c22 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -617,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -748,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -800,7 +829,7 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); } @@ -1249,21 +1278,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -1988,13 +2034,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2049,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2062,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2080,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2099,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2311,7 +2357,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2375,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2395,10 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) goto error; @@ -2399,12 +2449,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -2696,7 +2740,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2707,28 +2750,8 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; - -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab); } EXPORT_SYMBOL(ksize); @@ -4017,6 +4040,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4303,6 +4332,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebba..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the @@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - unsigned long nr_scanned = sc->nr_scanned; restart: nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } |