diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 5 | ||||
-rw-r--r-- | mm/filemap.c | 18 | ||||
-rw-r--r-- | mm/huge_memory.c | 93 | ||||
-rw-r--r-- | mm/ksm.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 1102 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 173 | ||||
-rw-r--r-- | mm/oom_kill.c | 42 | ||||
-rw-r--r-- | mm/page_alloc.c | 55 | ||||
-rw-r--r-- | mm/page_cgroup.c | 164 | ||||
-rw-r--r-- | mm/rmap.c | 20 | ||||
-rw-r--r-- | mm/slub.c | 9 | ||||
-rw-r--r-- | mm/swap.c | 79 | ||||
-rw-r--r-- | mm/swap_state.c | 10 | ||||
-rw-r--r-- | mm/swapfile.c | 9 | ||||
-rw-r--r-- | mm/vmalloc.c | 9 | ||||
-rw-r--r-- | mm/vmscan.c | 680 | ||||
-rw-r--r-- | mm/vmstat.c | 2 |
21 files changed, 1306 insertions, 1185 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index e6670c34eb49..71a58f67f481 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -350,7 +350,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } if (!cc->sync) - mode |= ISOLATE_CLEAN; + mode |= ISOLATE_ASYNC_MIGRATE; /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) @@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -671,6 +671,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .sync = true, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/filemap.c b/mm/filemap.c index c4ee2e918bea..97f49ed35bd2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - struct mem_cgroup *memcg = NULL; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); - /* - * This is not page migration, but prepare_migration and - * end_migration does enough work for charge replacement. - * - * In the longer term we probably want a specialized function - * for moving the charge from old to new in a more efficient - * manner. - */ - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); - if (error) - return error; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; @@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); - mem_cgroup_end_migration(memcg, old, new, true); - } else { - mem_cgroup_end_migration(memcg, old, new, false); } return error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36b3d988b4ef..b3ffc21ce801 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; -#endif /* CONFIG_SYSFS */ -static int __init hugepage_init(void) +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; -#ifdef CONFIG_SYSFS - static struct kobject *hugepage_kobj; -#endif - - err = -EINVAL; - if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; - goto out; - } -#ifdef CONFIG_SYSFS - err = -ENOMEM; - hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); - if (unlikely(!hugepage_kobj)) { + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { printk(KERN_ERR "hugepage: failed kobject create\n"); - goto out; + return -ENOMEM; } - err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + goto delete_obj; } - err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + goto remove_hp_group; } -#endif + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + return -EINVAL; + } + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + return err; err = khugepaged_slab_init(); if (err) @@ -545,7 +572,9 @@ static int __init hugepage_init(void) set_recommended_min_free_kbytes(); + return 0; out: + hugepage_exit_sysfs(hugepage_kobj); return err; } module_init(hugepage_init) @@ -997,7 +1026,7 @@ out: } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd) + pmd_t *pmd, unsigned long addr) { int ret = 0; @@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = get_pmd_huge_pte(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); - flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); ret = 1; } } else @@ -1199,16 +1228,16 @@ static int __split_huge_page_splitting(struct page *page, static void __split_huge_page_refcount(struct page *page) { int i; - unsigned long head_index = page->index; struct zone *zone = page_zone(page); - int zonestat; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); compound_lock(page); + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { struct page *page_tail = page + i; /* tail_page->_mapcount cannot change */ @@ -1271,14 +1300,13 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(page_tail->mapping); page_tail->mapping = page->mapping; - page_tail->index = ++head_index; + page_tail->index = page->index + i; BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - mem_cgroup_split_huge_fixup(page, page_tail); lru_add_page_tail(zone, page, page_tail); } @@ -1288,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); - /* - * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, - * so adjust those appropriately if this page is on the LRU. - */ - if (PageLRU(page)) { - zonestat = NR_LRU_BASE + page_lru(page); - __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); - } - ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); @@ -28,6 +28,7 @@ #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/memcontrol.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> @@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d87aa3510c5e..602207be9853 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; +struct mem_cgroup_reclaim_iter { + /* css_id of the last scanned hierarchy member */ + int position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_zone { - /* - * spin_lock to protect the per cgroup LRU - */ - struct list_head lists[NR_LRU_LISTS]; + struct lruvec lruvec; unsigned long count[NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ @@ -233,11 +239,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - /* - * While reclaiming in a hierarchy, we cache the last child we - * reclaimed from. - */ - int last_scanned_child; int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; @@ -366,8 +367,6 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); @@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); @@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); -} - -void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) -{ - this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, return total; } -static bool __memcg_event_check(struct mem_cgroup *memcg, int target) +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - return ((long)next - (long)val < 0); -} - -static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; - default: - return; + if ((long)next - (long)val < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->stat->targets[target], next); + return true; } - - __this_cpu_write(memcg->stat->targets[target], next); + return false; } /* @@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit, do_numainfo; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); +#if MAX_NUMNODES > 1 + do_numainfo = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_NUMAINFO); +#endif + preempt_enable(); + mem_cgroup_threshold(memcg); - __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT))) { + if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(memcg, - MEM_CGROUP_TARGET_NUMAINFO))) { + if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); - __mem_cgroup_target_update(memcg, - MEM_CGROUP_TARGET_NUMAINFO); - } #endif - } - preempt_enable(); + } else + preempt_enable(); } struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a zone and a priority level in @reclaim to + * divide up the memcgs in the hierarchy among all concurrent + * reclaimers operating on the same zone and priority. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { - struct cgroup_subsys_state *css; - int found; + struct mem_cgroup *memcg = NULL; + int id = 0; - if (!memcg) /* ROOT cgroup has the smallest ID */ - return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!memcg->use_hierarchy) { - if (css_tryget(&memcg->css)) - return memcg; + if (mem_cgroup_disabled()) return NULL; - } - rcu_read_lock(); - /* - * searching a memory cgroup which has the smallest ID under given - * ROOT cgroup. (ID >= 1) - */ - css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); - if (css && css_tryget(css)) - memcg = container_of(css, struct mem_cgroup, css); - else - memcg = NULL; - rcu_read_unlock(); - return memcg; -} -static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, - struct mem_cgroup *root, - bool cond) -{ - int nextid = css_id(&iter->css) + 1; - int found; - int hierarchy_used; - struct cgroup_subsys_state *css; + if (!root) + root = root_mem_cgroup; - hierarchy_used = iter->use_hierarchy; + if (prev && !reclaim) + id = css_id(&prev->css); - css_put(&iter->css); - /* If no ROOT, walk all, ignore hierarchy */ - if (!cond || (root && !hierarchy_used)) - return NULL; + if (prev && prev != root) + css_put(&prev->css); - if (!root) - root = root_mem_cgroup; + if (!root->use_hierarchy && root != root_mem_cgroup) { + if (prev) + return NULL; + return root; + } - do { - iter = NULL; - rcu_read_lock(); + while (!memcg) { + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css; + + if (reclaim) { + int nid = zone_to_nid(reclaim->zone); + int zid = zone_idx(reclaim->zone); + struct mem_cgroup_per_zone *mz; - css = css_get_next(&mem_cgroup_subsys, nextid, - &root->css, &found); - if (css && css_tryget(css)) - iter = container_of(css, struct mem_cgroup, css); + mz = mem_cgroup_zoneinfo(root, nid, zid); + iter = &mz->reclaim_iter[reclaim->priority]; + if (prev && reclaim->generation != iter->generation) + return NULL; + id = iter->position; + } + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); + if (css) { + if (css == &root->css || css_tryget(css)) + memcg = container_of(css, + struct mem_cgroup, css); + } else + id = 0; rcu_read_unlock(); - /* If css is NULL, no more cgroups will be found */ - nextid = found + 1; - } while (css && !iter); - return iter; + if (reclaim) { + iter->position = id; + if (!css) + iter->generation++; + else if (!prev && memcg) + reclaim->generation = iter->generation; + } + + if (prev && !css) + return NULL; + } + return memcg; } -/* - * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please - * be careful that "break" loop is not allowed. We have reference count. - * Instead of that modify "cond" to be false and "continue" to exit the loop. - */ -#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ - for (iter = mem_cgroup_start_loop(root);\ - iter != NULL;\ - iter = mem_cgroup_get_next(iter, root, cond)) -#define for_each_mem_cgroup_tree(iter, root) \ - for_each_mem_cgroup_tree_cond(iter, root, true) +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); +} -#define for_each_mem_cgroup_all(iter) \ - for_each_mem_cgroup_tree_cond(iter, NULL, true) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { @@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) goto out; switch (idx) { - case PGMAJFAULT: - mem_cgroup_pgmajfault(memcg, 1); - break; case PGFAULT: - mem_cgroup_pgfault(memcg, 1); + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); @@ -963,6 +982,27 @@ out: } EXPORT_SYMBOL(mem_cgroup_count_vm_event); +/** + * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg + * @zone: zone of the wanted lruvec + * @mem: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for the given @zone and + * @mem. This can be the global zone lruvec, if the memory controller + * is disabled. + */ +struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return &zone->lruvec; + + mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + return &mz->lruvec; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); * When moving account, the page is not on LRU. It's isolated. */ -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - /* can happen while we handle swapcache. */ - if (!TestClearPageCgroupAcctLRU(pc)) - return; - VM_BUG_ON(!pc->mem_cgroup); - /* - * We don't check PCG_USED bit. It's cleared when the "page" is finally - * removed from global LRU. - */ - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - VM_BUG_ON(list_empty(&pc->lru)); - list_del_init(&pc->lru); -} - -void mem_cgroup_del_lru(struct page *page) -{ - mem_cgroup_del_lru_list(page, page_lru(page)); -} - -/* - * Writeback is about to end against a page which has been marked for immediate - * reclaim. If it still appears to be reclaimable, move it to the tail of the - * inactive list. +/** + * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec + * @zone: zone of the page + * @page: the page + * @lru: current lru + * + * This function accounts for @page being added to @lru, and returns + * the lruvec for the given @zone and the memcg @page is charged to. + * + * The callsite is then responsible for physically linking the page to + * the returned lruvec->lists[@lru]. */ -void mem_cgroup_rotate_reclaimable_page(struct page *page) +struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, + enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; - enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) - return; + return &zone->lruvec; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move_tail(&pc->lru, &mz->lists[lru]); + memcg = pc->mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); + /* compound_order() is stabilized through lru_lock */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); + return &mz->lruvec; } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +/** + * mem_cgroup_lru_del_list - account for removing an lru page + * @page: the page + * @lru: target lru + * + * This function accounts for @page being removed from @lru. + * + * The callsite is then responsible for physically unlinking + * @page->lru. + */ +void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; struct page_cgroup *pc; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - list_move(&pc->lru, &mz->lists[lru]); -} - -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return; - pc = lookup_page_cgroup(page); - VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - if (!PageCgroupUsed(pc)) - return; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + memcg = pc->mem_cgroup; + VM_BUG_ON(!memcg); + mz = page_cgroup_zoneinfo(memcg, page); /* huge page split is done under lru_lock. so, we have no races. */ - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); - SetPageCgroupAcctLRU(pc); - if (mem_cgroup_is_root(pc->mem_cgroup)) - return; - list_add(&pc->lru, &mz->lists[lru]); -} - -/* - * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed - * while it's linked to lru because the page may be reused after it's fully - * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. - * It's done under lock_page and expected that zone->lru_lock isnever held. - */ -static void mem_cgroup_lru_del_before_commit(struct page *page) -{ - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - - /* - * Doing this check without taking ->lru_lock seems wrong but this - * is safe. Because if page_cgroup's USED bit is unset, the page - * will not be added to any memcg's LRU. If page_cgroup's USED bit is - * set, the commit after this will fail, anyway. - * This all charge/uncharge is done under some mutual execustion. - * So, we don't need to taking care of changes in USED bit. - */ - if (likely(!PageLRU(page))) - return; - - spin_lock_irqsave(&zone->lru_lock, flags); - /* - * Forget old LRU when this page_cgroup is *not* used. This Used bit - * is guarded by lock_page() because the page is SwapCache. - */ - if (!PageCgroupUsed(pc)) - mem_cgroup_del_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); } -static void mem_cgroup_lru_add_after_commit(struct page *page) +void mem_cgroup_lru_del(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); - struct page_cgroup *pc = lookup_page_cgroup(page); - /* - * putback: charge: - * SetPageLRU SetPageCgroupUsed - * smp_mb smp_mb - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU - * - * Ensure that one of the two sides adds the page to the memcg - * LRU during a race. - */ - smp_mb(); - /* taking care of that the page is added to LRU while we commit it */ - if (likely(!PageLRU(page))) - return; - spin_lock_irqsave(&zone->lru_lock, flags); - /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && !PageCgroupAcctLRU(pc)) - mem_cgroup_add_lru_list(page, page_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_lru_del_list(page, page_lru(page)); } - -void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to) +/** + * mem_cgroup_lru_move_lists - account for moving a page between lrus + * @zone: zone of the page + * @page: the page + * @from: current lru + * @to: target lru + * + * This function accounts for @page being moved between the lrus @from + * and @to, and returns the lruvec for the given @zone and the memcg + * @page is charged to. + * + * The callsite is then responsible for physically relinking + * @page->lru to the returned lruvec->lists[@to]. + */ +struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { - if (mem_cgroup_disabled()) - return; - mem_cgroup_del_lru_list(page, from); - mem_cgroup_add_lru_list(page, to); + /* XXX: Optimize this, especially for @from == @to */ + mem_cgroup_lru_del_list(page, from); + return mem_cgroup_lru_add_list(zone, page, to); } /* @@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) struct task_struct *p; p = find_lock_task_mm(task); - if (!p) - return 0; - curr = try_get_mem_cgroup_from_mm(p->mm); - task_unlock(p); + if (p) { + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. + */ + task_lock(task); + curr = mem_cgroup_from_task(task); + if (curr) + css_get(&curr->css); + task_unlock(task); + } if (!curr) return 0; /* @@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return &mz->reclaim_stat; } -unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file) -{ - unsigned long nr_taken = 0; - struct page *page; - unsigned long scan; - LIST_HEAD(pc_list); - struct list_head *src; - struct page_cgroup *pc, *tmp; - int nid = zone_to_nid(z); - int zid = zone_idx(z); - struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * file + active; - int ret; - - BUG_ON(!mem_cont); - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - src = &mz->lists[lru]; - - scan = 0; - list_for_each_entry_safe_reverse(pc, tmp, src, lru) { - if (scan >= nr_to_scan) - break; - - if (unlikely(!PageCgroupUsed(pc))) - continue; - - page = lookup_cgroup_page(pc); - - if (unlikely(!PageLRU(page))) - continue; - - scan++; - ret = __isolate_lru_page(page, mode, file); - switch (ret) { - case 0: - list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); - break; - case -EBUSY: - /* we don't affect global LRU but rotate in our LRU */ - mem_cgroup_rotate_lru_list(page, page_lru(page)); - break; - default: - break; - } - } - - *scanned = scan; - - trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, - 0, 0, 0, mode); - - return nr_taken; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. - */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_memcg) +static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned long flags) { - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; - - if (!root_memcg->use_hierarchy) { - css_get(&root_memcg->css); - ret = root_memcg; - } + unsigned long total = 0; + bool noswap = false; + int loop; - while (!ret) { - rcu_read_lock(); - nextid = root_memcg->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); + if (flags & MEM_CGROUP_RECLAIM_NOSWAP) + noswap = true; + if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) + noswap = true; - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_memcg->last_scanned_child = 0; - } else - root_memcg->last_scanned_child = found; + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { + if (loop) + drain_all_stock_async(memcg); + total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); + /* + * Allow limit shrinkers, which are triggered directly + * by userspace, to catch signals and stop reclaim + * after minimal progress, regardless of the margin. + */ + if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) + break; + if (mem_cgroup_margin(memcg)) + break; + /* + * If nothing was reclaimed after two attempts, there + * may be no reclaimable pages in this hierarchy. + */ + if (loop && !total) + break; } - - return ret; + return total; } /** @@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * Scan the hierarchy if needed to reclaim memory. We remember the last child - * we reclaimed from, so that we don't end up penalizing one child extensively - * based on its position in the children list. - * - * root_memcg is the original ancestor that we've been reclaim from. - * - * We give up and return to the caller when we visit root_memcg twice. - * (other groups can be removed while we're walking....) - * - * If shrink==true, for avoiding to free too much, this returns immedieately. - */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long reclaim_options, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim; - int ret, total = 0; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; int loop = 0; - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_memcg->memsw_is_minimum) - noswap = true; - while (1) { - victim = mem_cgroup_select_victim(root_memcg); - if (victim == root_memcg) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { loop++; - /* - * We are not draining per cpu cached charges during - * soft limit reclaim because global reclaim doesn't - * care about charges. It tries to free some memory and - * charges will not give any. - */ - if (!check_soft && loop >= 1) - drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim * anything, it might because there are * no reclaimable pages under this hierarchy */ - if (!check_soft || !total) { - css_put(&victim->css); + if (!total) break; - } /* * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to @@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, * coming back to reclaim from this cgroup */ if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { - css_put(&victim->css); + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break; - } } - } - if (!mem_cgroup_reclaimable(victim, noswap)) { - /* this cgroup's local usage == 0 */ - css_put(&victim->css); continue; } - /* we use swappiness of local cgroup */ - if (check_soft) { - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; - } else - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); - css_put(&victim->css); - /* - * At shrinking usage, we can't check we should stop here or - * reclaim more. It's depends on callers. last_scanned_child - * will work enough for keeping fairness under tree. - */ - if (shrink) - return ret; - total += ret; - if (check_soft) { - if (!res_counter_soft_limit_excess(&root_memcg->res)) - return total; - } else if (mem_cgroup_margin(root_memcg)) - return total; + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } + mem_cgroup_iter_break(root_memcg, victim); return total; } @@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; - bool cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ failed = iter; - cond = false; + mem_cgroup_iter_break(memcg, iter); + break; } else iter->oom_lock = true; } @@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ - cond = true; - for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - cond = false; - continue; + mem_cgroup_iter_break(memcg, iter); + break; } iter->oom_lock = false; } @@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page, bool need_unlock = false; unsigned long uninitialized_var(flags); - if (unlikely(!pc)) + if (mem_cgroup_disabled()) return; rcu_read_lock(); @@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct mem_cgroup *iter; if ((action == CPU_ONLINE)) { - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) synchronize_mem_cgroup_on_move(iter, cpu); return NOTIFY_OK; } @@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup_all(iter) + for_each_mem_cgroup(iter) mem_cgroup_drain_pcp_counter(iter, cpu); stock = &per_cpu(memcg_stock, cpu); @@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags, NULL); + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } /* - * Unlike exported interface, "oom" parameter is added. if oom==true, - * oom-killer can be invoked. + * __mem_cgroup_try_charge() does + * 1. detect memcg to be charged against from passed *mm and *ptr, + * 2. update res_counter + * 3. call memory reclaim if necessary. + * + * In some special case, if the task is fatal, fatal_signal_pending() or + * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup + * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon + * as possible without any hazards. 2: all pages should have a valid + * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg + * pointer, that is treated as a charge to root_mem_cgroup. + * + * So __mem_cgroup_try_charge() will return + * 0 ... on success, filling *ptr with a valid memcg pointer. + * -ENOMEM ... charge failure because of resource limits. + * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. + * + * Unlike the exported interface, an "oom" parameter is added. if oom==true, + * the oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, @@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * set, if so charge the init_mm (happens for pagecache usage). */ if (!*ptr && !mm) - goto bypass; + *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -2390,7 +2258,9 @@ again: * task-struct. So, mm->owner can be NULL. */ memcg = mem_cgroup_from_task(p); - if (!memcg || mem_cgroup_is_root(memcg)) { + if (!memcg) + memcg = root_mem_cgroup; + if (mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } @@ -2465,8 +2335,8 @@ nomem: *ptr = NULL; return -ENOMEM; bypass: - *ptr = NULL; - return 0; + *ptr = root_mem_cgroup; + return -EINTR; } /* @@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); - id = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); if (memcg && !css_tryget(&memcg->css)) @@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); + WARN_ON_ONCE(PageLRU(page)); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) + (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compund_lock. + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will be never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. */ -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *tail_pc = lookup_page_cgroup(tail); - unsigned long flags; + struct page_cgroup *pc; + int i; if (mem_cgroup_disabled()) return; - /* - * We have no races with charge/uncharge but will have races with - * page state accounting. - */ - move_lock_page_cgroup(head_pc, &flags); - - tail_pc->mem_cgroup = head_pc->mem_cgroup; - smp_wmb(); /* see __commit_charge() */ - if (PageCgroupAcctLRU(head_pc)) { - enum lru_list lru; - struct mem_cgroup_per_zone *mz; - - /* - * LRU flags cannot be copied because we need to add tail - *.page to LRU by generic call and our hook will be called. - * We hold lru_lock, then, reduce counter directly. - */ - lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + for (i = 1; i < HPAGE_PMD_NR; i++) { + pc = head_pc + i; + pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb();/* see __commit_charge() */ + pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } - tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; - move_unlock_page_cgroup(head_pc, &flags); } -#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * mem_cgroup_move_account - move account of the page @@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page, parent = mem_cgroup_from_cont(pcg); ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); - if (ret || !parent) + if (ret) goto put_back; if (nr_pages > 1) @@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret || !memcg) + if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); return 0; } @@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - /* - * If already mapped, we don't have to account. - * If page cache, page->mapping has address_space. - * But page->mapping may have out-of-use anon_vma pointer, - * detecit it by PageAnon() check. newly-mapped-anon's page->mapping - * is NULL. - */ - if (page_mapped(page) || (page->mapping && !PageAnon(page))) - return 0; - if (unlikely(!mm)) - mm = &init_mm; + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void @@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { struct page_cgroup *pc = lookup_page_cgroup(page); + struct zone *zone = page_zone(page); + unsigned long flags; + bool removed = false; + /* * In some case, SwapCache, FUSE(splice_buf->radixtree), the page * is already on LRU. It means the page may on some other page_cgroup's * LRU. Take care of it. */ - mem_cgroup_lru_del_before_commit(page); + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(page)) { + del_page_from_lru_list(zone, page, page_lru(page)); + ClearPageLRU(page); + removed = true; + } __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - mem_cgroup_lru_add_after_commit(page); + if (removed) { + add_page_to_lru_list(zone, page, page_lru(page)); + SetPageLRU(page); + } + spin_unlock_irqrestore(&zone->lru_lock, flags); return; } @@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; int ret; if (mem_cgroup_disabled()) @@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; + if (!page_is_file_cache(page)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret || !memcg) - return ret; - - /* - * FUSE reuses pages without going through the final - * put that would remove them from the LRU list, make - * sure that they get relinked properly. - */ - __mem_cgroup_commit_charge_lrucare(page, memcg, - MEM_CGROUP_CHARGE_TYPE_CACHE); - return ret; - } - /* shmem */ - if (PageSwapCache(page)) { + if (!PageSwapCache(page)) + ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); + else { /* page is swapcache/shmem */ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - } else - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM); - + __mem_cgroup_commit_charge_swapin(page, memcg, type); + } return ret; } @@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t mask, struct mem_cgroup **ptr) + gfp_t mask, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *ptr = NULL; + *memcgp = NULL; if (mem_cgroup_disabled()) return 0; @@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); css_put(&memcg->css); + if (ret == -EINTR) + ret = 0; return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); + ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); + if (ret == -EINTR) + ret = 0; + return ret; } static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { if (mem_cgroup_disabled()) return; - if (!ptr) + if (!memcg) return; - cgroup_exclude_rmdir(&ptr->css); + cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); + __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *swap_memcg; unsigned short id; - struct mem_cgroup *memcg; id = swap_cgroup_record(ent, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { + swap_memcg = mem_cgroup_lookup(id); + if (swap_memcg) { /* * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + if (!mem_cgroup_is_root(swap_memcg)) + res_counter_uncharge(&swap_memcg->memsw, + PAGE_SIZE); + mem_cgroup_swap_statistics(swap_memcg, false); + mem_cgroup_put(swap_memcg); } rcu_read_unlock(); } @@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&ptr->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *memcg) { - __mem_cgroup_commit_charge_swapin(page, ptr, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge_swapin(page, memcg, + MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); - if (unlikely(!pc || !PageCgroupUsed(pc))) + if (unlikely(!PageCgroupUsed(pc))) return NULL; lock_page_cgroup(pc); @@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. */ if (page_mapped(page)) return; - if (page->mapping && !PageAnon(page)) - return; + VM_BUG_ON(page->mapping && !PageAnon(page)); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } @@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } +/* + * A function for resetting pc->mem_cgroup for newly allocated pages. + * This function should be called if the newpage will be added to LRU + * before start accounting. + */ +void mem_cgroup_reset_owner(struct page *newpage) +{ + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(newpage); + VM_BUG_ON(PageCgroupUsed(pc)); + pc->mem_cgroup = root_mem_cgroup; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. @@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; - *ptr = NULL; + *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) @@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page, if (!memcg) return 0; - *ptr = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); + *memcgp = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); css_put(&memcg->css);/* drop extra refcnt */ - if (ret || *ptr == NULL) { + if (ret) { if (PageAnon(page)) { lock_page_cgroup(pc); ClearPageCgroupMigration(pc); @@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page, */ mem_cgroup_uncharge_page(page); } + /* we'll need to revisit this error code (we have -EINTR) */ return -ENOMEM; } /* @@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, cgroup_release_and_wakeup_rmdir(&memcg->css); } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. + */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { struct page_cgroup *pc; pc = lookup_page_cgroup(page); + /* + * Can be NULL while feeding pages into the page allocator for + * the first time, i.e. during boot or memory hotplug; + * or when mem_cgroup_disabled(). + */ if (likely(pc) && PageCgroupUsed(pc)) return pc; return NULL; @@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - int ret = -1; - char *path; - - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", pc, pc->flags, pc->mem_cgroup); - - path = kmalloc(PATH_MAX, GFP_KERNEL); - if (path) { - rcu_read_lock(); - ret = cgroup_path(pc->mem_cgroup->css.cgroup, - path, PATH_MAX); - rcu_read_unlock(); - } - - printk(KERN_CONT "(%s)\n", - (ret < 0) ? "cannot get the path" : path); - kfree(path); } } #endif @@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK, - NULL); + mem_cgroup_reclaim(memcg, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, - gfp_mask, - MEM_CGROUP_RECLAIM_SOFT, - &nr_scanned); + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock(&mctz->lock); @@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { - struct zone *zone; struct mem_cgroup_per_zone *mz; - struct page_cgroup *pc, *busy; unsigned long flags, loop; struct list_head *list; + struct page *busy; + struct zone *zone; int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); - list = &mz->lists[lru]; + list = &mz->lruvec.lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; while (loop--) { + struct page_cgroup *pc; struct page *page; ret = 0; @@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, spin_unlock_irqrestore(&zone->lru_lock, flags); break; } - pc = list_entry(list->prev, struct page_cgroup, lru); - if (busy == pc) { - list_move(&pc->lru, list); + page = list_entry(list->prev, struct page, lru); + if (busy == page) { + list_move(&page->lru, list); busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = lookup_cgroup_page(pc); + pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) break; if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ - busy = pc; + busy = page; cond_resched(); } else busy = NULL; @@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) - INIT_LIST_HEAD(&mz->lists[l]); + INIT_LIST_HEAD(&mz->lruvec.lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = memcg; @@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); @@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void) struct mem_cgroup_tree_per_zone *rtpz; int tmp, node, zone; - for_each_node_state(node, N_POSSIBLE) { + for_each_node(node) { tmp = node; if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); if (!rtpn) - return 1; + goto err_cleanup; soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void) } } return 0; + +err_cleanup: + for_each_node(node) { + if (!soft_limit_tree.rb_tree_per_node[node]) + break; + kfree(soft_limit_tree.rb_tree_per_node[node]); + soft_limit_tree.rb_tree_per_node[node] = NULL; + } + return 1; + } static struct cgroup_subsys_state * __ref @@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (!memcg) return ERR_PTR(error); - for_each_node_state(node, N_POSSIBLE) + for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; @@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); } - memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); @@ -5129,9 +5027,9 @@ one_by_one: } ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &memcg, false); - if (ret || !memcg) + if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ - return -ENOMEM; + return ret; mc.precharge++; } return ret; @@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..56080ea36140 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, true); + 0, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index 829d43735402..5e30583c2605 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - tlb->need_flush = 1; + VM_BUG_ON(!tlb->need_flush); if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); @@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) + } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2168489c0bc9..6629fafd6ce4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, true); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3d58f088466..06b145fb64ab 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, true); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 89ea0854332e..9871a56d82c3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,56 @@ out: pte_unmap_unlock(ptep, ptl); } +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + get_bh(bh); + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + put_bh(failed_bh); + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} +#else +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + return true; +} +#endif /* CONFIG_BLOCK */ + /* * Replace the page in the mapping. * @@ -225,7 +275,8 @@ out: * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + struct buffer_head *head, enum migrate_mode mode) { int expected_count; void **pslot; @@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * In the async migration case of moving a page with buffers, lock the + * buffers using trylock before the mapping is moved. If the mapping + * was moved, we later failed to lock the buffers and could not move + * the mapping back due to an elevated page count, we would have to + * block waiting on other references to be dropped. + */ + if (mode == MIGRATE_ASYNC && head && + !buffer_migrate_lock_buffers(head, mode)) { + page_unfreeze_refs(page, expected_count); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + /* * Now we know that no one else is looking at the page. */ get_page(newpage); /* add cache reference */ @@ -409,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); if (rc) return rc; @@ -432,28 +498,28 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); if (rc) return rc; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + /* + * In the async case, migrate_page_move_mapping locked the buffers + * with an IRQ-safe spinlock held. In the sync case, the buffers + * need to be locked now + */ + if (mode != MIGRATE_ASYNC) + BUG_ON(!buffer_migrate_lock_buffers(head, mode)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -530,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, enum migrate_mode mode) { - if (PageDirty(page)) + if (PageDirty(page)) { + /* Only writeback pages in full synchronous migration */ + if (mode != MIGRATE_SYNC) + return -EBUSY; return writeout(mapping, page); + } /* * Buffers may be managed in a filesystem specific way. @@ -543,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } /* @@ -558,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, bool sync) + int remap_swapcache, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -579,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page); - else { + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) /* - * Do not writeback pages if !sync and migratepage is - * not pointing to migrate_page() which is nonblocking - * (swapcache/tmpfs uses migratepage = migrate_page). + * Most pages have a mapping and most filesystems provide a + * migratepage callback. Anonymous pages are part of swap + * space which also has its own migratepage callback. This + * is the most common path for page migration. */ - if (PageDirty(page) && !sync && - mapping->a_ops->migratepage != migrate_page) - rc = -EBUSY; - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - } + rc = mapping->a_ops->migratepage(mapping, + newpage, page, mode); + else + rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc) { newpage->mapping = NULL; @@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, bool sync) + int force, bool offlining, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { - if (!force || !sync) + if (!force || mode == MIGRATE_ASYNC) goto out; /* @@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * For !sync, there is no point retrying as the retry loop - * is expected to be too short for PageWriteback to be cleared + * Only in the case of a full syncronous migration is it + * necessary to wait for PageWriteback. In the async case, + * the retry loop is too short and in the sync-light case, + * the overhead of stalling is too much */ - if (!sync) { + if (mode != MIGRATE_SYNC) { rc = -EBUSY; goto uncharge; } @@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, sync); + rc = move_to_new_page(newpage, page, remap_swapcache, mode); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -768,7 +829,8 @@ out: * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) + struct page *page, int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -777,6 +839,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; + mem_cgroup_reset_owner(newpage); + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; @@ -786,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, sync); + rc = __unmap_and_move(page, newpage, force, offlining, mode); out: if (rc != -EAGAIN) { /* @@ -834,7 +898,8 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, bool sync) + int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -847,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force || !sync) + if (!force || mode != MIGRATE_SYNC) goto out; lock_page(hpage); } @@ -858,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, sync); + rc = move_to_new_page(new_hpage, hpage, 1, mode); if (rc) remove_migration_ptes(hpage, hpage); @@ -901,7 +966,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -922,7 +987,7 @@ int migrate_pages(struct list_head *from, rc = unmap_and_move(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -952,7 +1017,7 @@ out: int migrate_huge_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -969,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from, rc = unmap_and_move_huge_page(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1098,7 +1163,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, true); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c122faa05c5..2958fd8e7c9a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *mem, const nodemask_t *nodemask) + const struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, return true; /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (mem && !task_in_mem_cgroup(p, mem)) + if (memcg && !task_in_mem_cgroup(p, memcg)) return true; /* p may not have freeable memory in nodemask */ @@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); @@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *mem, + unsigned long totalpages, struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *g, *p; @@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p->exit_state) continue; - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; /* @@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, } } - points = oom_badness(p, mem, nodemask, totalpages); + points = oom_badness(p, memcg, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; @@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) +static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (oom_unkillable_task(p, mem, nodemask)) + if (oom_unkillable_task(p, memcg, nodemask)) continue; task = find_lock_task_mm(p); @@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(mem, p); + mem_cgroup_print_oom_info(memcg, p); show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(mem, nodemask); + dump_tasks(memcg, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) +static int oom_kill_task(struct task_struct *p) { struct task_struct *q; struct mm_struct *mm; @@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, - struct mem_cgroup *mem, nodemask_t *nodemask, + struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message) { struct task_struct *victim = p; @@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem, nodemask); + dump_header(p, gfp_mask, order, memcg, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, mem, nodemask, + child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > victim_points) { victim = child; @@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } } while_each_thread(p, t); - return oom_kill_task(victim, mem); + return oom_kill_task(victim); } /* @@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) { unsigned long limit; unsigned int points = 0; @@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); - limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; + limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, limit, mem, NULL); + p = select_bad_process(&points, limit, memcg, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, "Memory cgroup out of memory")) goto retry; out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 794e6715c226..0027d8f4a1bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1981,14 +1981,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -2028,8 +2040,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2259,12 +2273,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2328,8 +2352,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } @@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list l; + enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) - INIT_LIST_HEAD(&zone->lru[l].list); + for_each_lru(lru) + INIT_LIST_HEAD(&zone->lruvec.lists[lru]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; @@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) + if (zone->present_pages) { node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + break; + } } #endif } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2d123f94a8df..de1616aa9b1e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,13 +11,6 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) -{ - pc->flags = 0; - set_page_cgroup_array_id(pc, id); - pc->mem_cgroup = NULL; - INIT_LIST_HEAD(&pc->lru); -} static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) @@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) struct page_cgroup *base; base = NODE_DATA(page_to_nid(page))->node_page_cgroup; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (unlikely(!base)) return NULL; - +#endif offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; return base + offset; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - unsigned long pfn; - struct page *page; - pg_data_t *pgdat; - - pgdat = NODE_DATA(page_cgroup_array_id(pc)); - pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; - page = pfn_to_page(pfn); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static int __init alloc_node_page_cgroup(int nid) { - struct page_cgroup *base, *pc; + struct page_cgroup *base; unsigned long table_size; - unsigned long start_pfn, nr_pages, index; + unsigned long nr_pages; - start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) return 0; @@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid) table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) return -ENOMEM; - for (index = 0; index < nr_pages; index++) { - pc = base + index; - init_page_cgroup(pc, nid); - } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; return 0; @@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); - +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_cgroup arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ if (!section->page_cgroup) return NULL; +#endif return section->page_cgroup + pfn; } -struct page *lookup_cgroup_page(struct page_cgroup *pc) -{ - struct mem_section *section; - struct page *page; - unsigned long nr; - - nr = page_cgroup_array_id(pc); - section = __nr_to_section(nr); - page = pfn_to_page(pc - section->page_cgroup); - VM_BUG_ON(pc != lookup_page_cgroup(page)); - return page; -} - static void *__meminit alloc_page_cgroup(size_t size, int nid) { + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; - gfp_t flags = GFP_KERNEL | __GFP_NOWARN; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { @@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid) } if (node_state(nid, N_HIGH_MEMORY)) - addr = vmalloc_node(size, nid); + addr = vzalloc_node(size, nid); else - addr = vmalloc(size); + addr = vzalloc(size); return addr; } -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - free_pages_exact(addr, table_size); - } -} -#endif - static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) { - struct page_cgroup *base, *pc; struct mem_section *section; + struct page_cgroup *base; unsigned long table_size; - unsigned long nr; - int index; - nr = pfn_to_section_nr(pfn); - section = __nr_to_section(nr); + section = __pfn_to_section(pfn); if (section->page_cgroup) return 0; @@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return -ENOMEM; } - for (index = 0; index < PAGES_PER_SECTION; index++) { - pc = base + index; - init_page_cgroup(pc, nr); - } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. @@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) return 0; } #ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} + void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; @@ -366,7 +332,6 @@ struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) -#define SC_POS_MASK (SC_PER_PAGE - 1) /* * SwapCgroup implements "lookup" and "exchange" operations. @@ -408,6 +373,21 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + return page_address(mappage) + offset % SC_PER_PAGE; +} + /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. * @end: swap entry to be cmpxchged @@ -420,21 +400,13 @@ not_enough_page: unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) @@ -455,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; struct swap_cgroup_ctrl *ctrl; - struct page *mappage; struct swap_cgroup *sc; unsigned short old; unsigned long flags; - ctrl = &swap_cgroup_ctrl[type]; + sc = lookup_swap_cgroup(ent, &ctrl); - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; @@ -479,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } /** - * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { - int type = swp_type(ent); - unsigned long offset = swp_offset(ent); - unsigned long idx = offset / SC_PER_PAGE; - unsigned long pos = offset & SC_POS_MASK; - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - unsigned short ret; - - ctrl = &swap_cgroup_ctrl[type]; - mappage = ctrl->map[idx]; - sc = page_address(mappage); - sc += pos; - ret = sc->id; - return ret; + return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) diff --git a/mm/rmap.c b/mm/rmap.c index a2e5ce1fa081..c8454e06b6c8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -773,7 +773,7 @@ out: } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. - * @mem_cont: target memory controller + * @memcg: target memory control group * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and @@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { unsigned int mapcount; @@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page, * counting on behalf of references from different * cgroups */ - if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) continue; referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); @@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page - * @mem_cont: target memory controller + * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, @@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page, */ int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont, + struct mem_cgroup *memcg, unsigned long *vm_flags) { int referenced = 0; @@ -904,13 +904,13 @@ int page_referenced(struct page *page, } } if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, mem_cont, + referenced += page_referenced_ksm(page, memcg, vm_flags); else if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont, + referenced += page_referenced_anon(page, memcg, vm_flags); else if (page->mapping) - referenced += page_referenced_file(page, mem_cont, + referenced += page_referenced_file(page, memcg, vm_flags); if (we_locked) unlock_page(page); diff --git a/mm/slub.c b/mm/slub.c index 5d37b5e44140..4907563ef7ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; diff --git a/mm/swap.c b/mm/swap.c index 67a09a633a09..b0f529b38979 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> -#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/cpu.h> @@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, void *arg) { int *pgmoved = arg; - struct zone *zone = page_zone(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_move_lists(page_zone(page), + page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } } @@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page) SetPageReferenced(page); } } - EXPORT_SYMBOL(mark_page_accessed); void __lru_cache_add(struct page *page, enum lru_list lru) @@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) page_cache_get(page); if (!pagevec_add(pvec, page)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { + struct lruvec *lruvec; /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); + lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); + list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } @@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu) for_each_lru(lru) { pvec = &pvecs[lru - LRU_BASE]; if (pagevec_count(pvec)) - ____pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec, lru); } pvec = &per_cpu(lru_rotate_pvecs, cpu); @@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold) } VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); - del_page_from_lru(zone, page); + del_page_from_lru_list(zone, page, page_off_lru(page)); } list_add(&page->lru, &pages_to_free); @@ -644,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec) release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } - EXPORT_SYMBOL(__pagevec_release); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) @@ -654,7 +655,6 @@ void lru_add_page_tail(struct zone* zone, int active; enum lru_list lru; const int file = 0; - struct list_head *head; VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); @@ -673,18 +673,30 @@ void lru_add_page_tail(struct zone* zone, lru = LRU_INACTIVE_ANON; } update_page_reclaim_stat(zone, page_tail, file, active); - if (likely(PageLRU(page))) - head = page->lru.prev; - else - head = &zone->lru[lru].list; - __add_page_to_lru_list(zone, page_tail, lru, head); } else { SetPageUnevictable(page_tail); - add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + lru = LRU_UNEVICTABLE; + } + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else { + struct list_head *list_head; + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Use the standard add function to put page_tail on the list, + * but then correct its position so they all end up in order. + */ + add_page_to_lru_list(zone, page_tail, lru); + list_head = page_tail->lru.prev; + list_move_tail(&page_tail->lru, list_head); } } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void ____pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, void *arg) { enum lru_list lru = (enum lru_list)arg; struct zone *zone = page_zone(page); @@ -706,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { VM_BUG_ON(is_unevictable_lru(lru)); - pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); -} - -EXPORT_SYMBOL(____pagevec_lru_add); - -/* - * Try to drop buffers from the pages in a pagevec - */ -void pagevec_strip(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); } +EXPORT_SYMBOL(__pagevec_lru_add); /** * pagevec_lookup - gang pagecache lookup @@ -755,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, @@ -765,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, nr_pages, pvec->pages); return pagevec_count(pvec); } - EXPORT_SYMBOL(pagevec_lookup_tag); /* diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..470038a91873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ + /* + * The memcg-specific accounting when moving + * pages around the LRU lists relies on the + * page's owner (memcg) to be valid. Usually, + * pages are assigned to a new owner before + * being put on the LRU list, but since this + * is not the case here, the stale owner from + * a previous allocation cycle must be reset. + */ + mem_cgroup_reset_owner(new_page); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 9520592d4231..d999f090dfda 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } @@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { if (ret > 0) - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 877ca046f43d..86ce9a526c17 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) - goto err_free; + goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); @@ -2476,11 +2476,10 @@ found: err_free: for (area = 0; area < nr_vms; area++) { - if (vas) - kfree(vas[area]); - if (vms) - kfree(vms[area]); + kfree(vas[area]); + kfree(vms[area]); } +err_free2: kfree(vas); kfree(vms); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index 26f4a8a4e0c7..2880396f7953 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -103,8 +103,11 @@ struct scan_control { */ reclaim_mode_t reclaim_mode; - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -113,6 +116,11 @@ struct scan_control { nodemask_t *nodemask; }; +struct mem_cgroup_zone { + struct mem_cgroup *mem_cgroup; + struct zone *zone; +}; + #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc) (!(sc)->mem_cgroup) +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return !mz->mem_cgroup; +} #else -#define scanning_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + +static bool scanning_global_lru(struct mem_cgroup_zone *mz) +{ + return true; +} #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, - struct scan_control *sc) +static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) { - if (!scanning_global_lru(sc)) - return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + if (!scanning_global_lru(mz)) + return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - return &zone->reclaim_stat; + return &mz->zone->reclaim_stat; } -static unsigned long zone_nr_lru_pages(struct zone *zone, - struct scan_control *sc, enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, + enum lru_list lru) { - if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, - zone_to_nid(zone), zone_idx(zone), BIT(lru)); + if (!scanning_global_lru(mz)) + return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, + zone_to_nid(mz->zone), + zone_idx(mz->zone), + BIT(lru)); - return zone_page_state(zone, NR_LRU_BASE + lru); + return zone_page_state(mz->zone, NR_LRU_BASE + lru); } @@ -677,12 +702,13 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, + struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ @@ -738,7 +764,7 @@ static enum page_references page_check_references(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct mem_cgroup_zone *mz, struct scan_control *sc, int priority, unsigned long *ret_nr_dirty, @@ -769,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON(page_zone(page) != mz->zone); sc->nr_scanned++; @@ -803,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - references = page_check_references(page, sc); + references = page_check_references(page, mz, sc); switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -994,8 +1020,8 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) - zone_set_flag(zone, ZONE_CONGESTED); + if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) + zone_set_flag(mz->zone, ZONE_CONGESTED); free_hot_cold_page_list(&free_pages, 1); @@ -1049,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; - if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) - return ret; + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; @@ -1079,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. + * @nr_scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, isolate_mode_t mode, - int file) + struct mem_cgroup_zone *mz, struct list_head *dst, + unsigned long *nr_scanned, int order, isolate_mode_t mode, + int active, int file) { + struct lruvec *lruvec; + struct list_head *src; unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; unsigned long nr_lumpy_dirty = 0; unsigned long nr_lumpy_failed = 0; unsigned long scan; + int lru = LRU_BASE; + + lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); + if (active) + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; @@ -1113,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: + mem_cgroup_lru_del(page); list_move(&page->lru, dst); - mem_cgroup_del_lru(page); nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -1171,13 +1238,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { + unsigned int isolated_pages; + + mem_cgroup_lru_del(cursor_page); list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(cursor_page); - nr_lumpy_taken++; + isolated_pages = hpage_nr_pages(cursor_page); + nr_taken += isolated_pages; + nr_lumpy_taken += isolated_pages; if (PageDirty(cursor_page)) - nr_lumpy_dirty++; + nr_lumpy_dirty += isolated_pages; scan++; + pfn += isolated_pages - 1; } else { /* * Check if the page is freed already. @@ -1203,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_failed++; } - *scanned = scan; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + mode, file); return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, int active, int file) -{ - int lru = LRU_BASE; - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, file); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. - */ -static unsigned long clear_active_flags(struct list_head *page_list, - unsigned int *count) -{ - int nr_active = 0; - int lru; - struct page *page; - - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - if (count) - count[lru] += numpages; - } - - return nr_active; -} - /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list @@ -1313,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) return 0; if (file) { @@ -1327,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file, return isolated > inactive; } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */ static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_inactive_pages(struct mem_cgroup_zone *mz, + struct list_head *page_list) { - struct page *page; - struct pagevec pvec; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - - pagevec_init(&pvec, 1); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; + LIST_HEAD(pages_to_free); /* * Put back any unfreeable pages. */ - spin_lock(&zone->lru_lock); while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); int lru; - page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); list_del(&page->lru); if (unlikely(!page_evictable(page, NULL))) { @@ -1364,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); } } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + /* + * To save our caller's stack, now use input list for pages to free. + */ + list_splice(&pages_to_free, page_list); } -static noinline_for_stack void update_isolated_counts(struct zone *zone, - struct scan_control *sc, - unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) +static noinline_for_stack void +update_isolated_counts(struct mem_cgroup_zone *mz, + struct list_head *page_list, + unsigned long *nr_anon, + unsigned long *nr_file) { - unsigned long nr_active; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_active = 0; + struct page *page; + int lru; + + /* + * Count pages and clear active flags + */ + list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); + lru = page_lru_base_type(page); + if (PageActive(page)) { + lru += LRU_ACTIVE; + ClearPageActive(page); + nr_active += numpages; + } + count[lru] += numpages; + } - nr_active = clear_active_flags(isolated_list, count); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE_FILE, @@ -1401,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; @@ -1454,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority, int file) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1466,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + struct zone *zone = mz->zone; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1488,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, + &nr_scanned, sc->order, + reclaim_mode, 0, file); + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1498,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, - &nr_scanned, sc->order, reclaim_mode, zone, - sc->mem_cgroup, 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ } if (nr_taken == 0) { @@ -1513,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return 0; } - update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, + nr_reclaimed += shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); } - local_irq_disable(); + spin_lock_irq(&zone->lru_lock); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); - putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + putback_inactive_pages(mz, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&page_list, 1); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1588,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, static void move_active_pages_to_lru(struct zone *zone, struct list_head *list, + struct list_head *pages_to_free, enum lru_list lru) { unsigned long pgmoved = 0; - struct pagevec pvec; struct page *page; - pagevec_init(&pvec, 1); + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + list_for_each_entry(page, list, lru) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + spin_lock_irq(&zone->lru_lock); + } while (!list_empty(list)) { + struct lruvec *lruvec; + page = lru_to_page(list); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_move(&page->lru, &lruvec->lists[lru]); pgmoved += hpage_nr_pages(page); - if (!pagevec_add(&pvec, page) || list_empty(list)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(zone, page, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1619,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_to_scan, + struct mem_cgroup_zone *mz, + struct scan_control *sc, + int priority, int file) { unsigned long nr_taken; - unsigned long pgscanned; + unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + struct zone *zone = mz->zone; lru_add_drain(); @@ -1641,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, reclaim_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - 1, file); - zone->pages_scanned += pgscanned; - } else { - nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, - &pgscanned, sc->order, - reclaim_mode, zone, - sc->mem_cgroup, 1, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. - */ - } + + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, + &nr_scanned, sc->order, + reclaim_mode, 1, file); + if (global_reclaim(sc)) + zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; - __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_zone_vm_events(PGREFILL, zone, nr_scanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else @@ -1678,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1711,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, + move_active_pages_to_lru(zone, &l_active, &l_hold, LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, + move_active_pages_to_lru(zone, &l_inactive, &l_hold, LRU_BASE + file * LRU_FILE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, 1); } #ifdef CONFIG_SWAP @@ -1741,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct mem_cgroup_zone *mz) { - int low; - /* * If we don't have swap space, anonymous page deactivation * is pointless. @@ -1752,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (!total_swap_pages) return 0; - if (scanning_global_lru(sc)) - low = inactive_anon_is_low_global(zone); - else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); - return low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, + mz->zone); + + return inactive_anon_is_low_global(mz->zone); } #else -static inline int inactive_anon_is_low(struct zone *zone, - struct scan_control *sc) +static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) { return 0; } @@ -1778,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone) /** * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @mz: memory cgroup and zone to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1791,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct mem_cgroup_zone *mz) { - int low; + if (!scanning_global_lru(mz)) + return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, + mz->zone); - if (scanning_global_lru(sc)) - low = inactive_file_is_low_global(zone); - else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); - return low; + return inactive_file_is_low_global(mz->zone); } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, - int file) +static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) { if (file) - return inactive_file_is_low(zone, sc); + return inactive_file_is_low(mz); else - return inactive_anon_is_low(zone, sc); + return inactive_anon_is_low(mz); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct zone *zone, struct scan_control *sc, int priority) + struct mem_cgroup_zone *mz, + struct scan_control *sc, int priority) { int file = is_file_lru(lru); if (is_active_lru(lru)) { - if (inactive_list_is_low(zone, sc, file)) - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (inactive_list_is_low(mz, file)) + shrink_active_list(nr_to_scan, mz, sc, priority, file); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); + return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); } -static int vmscan_swappiness(struct scan_control *sc) +static int vmscan_swappiness(struct mem_cgroup_zone *mz, + struct scan_control *sc) { - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(sc->mem_cgroup); + return mem_cgroup_swappiness(mz->mem_cgroup); } /* @@ -1840,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc) * * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); u64 fraction[2], denominator; - enum lru_list l; + enum lru_list lru; int noswap = 0; bool force_scan = false; @@ -1862,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (scanning_global_lru(sc) && current_is_kswapd()) + if (current_is_kswapd() && mz->zone->all_unreclaimable) force_scan = true; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1876,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + if (global_reclaim(sc)) { + free = zone_page_state(mz->zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(file + free <= high_wmark_pages(mz->zone))) { fraction[0] = 1; fraction[1] = 0; denominator = 1; @@ -1897,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - vmscan_swappiness(sc); + anon_prio = vmscan_swappiness(mz, sc); + file_prio = 200 - vmscan_swappiness(mz, sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1911,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&mz->zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1932,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&mz->zone->lru_lock); fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(l) { - int file = is_file_lru(l); + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); unsigned long scan; - scan = zone_nr_lru_pages(zone, sc, l); + scan = zone_nr_lru_pages(mz, lru); if (priority || noswap) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = scan; + nr[lru] = scan; } } @@ -1960,7 +2017,7 @@ out: * back to the allocator and call try_to_compact_zone(), we ensure that * there are enough free pages for it to be likely successful */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2000,15 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(mz->zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2020,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - enum lru_list l; + enum lru_list lru; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; @@ -2033,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone, restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; - get_scan_count(zone, sc, nr, priority); + get_scan_count(mz, sc, nr, priority); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(l) { - if (nr[l]) { + for_each_evictable_lru(lru) { + if (nr[lru]) { nr_to_scan = min_t(unsigned long, - nr[l], SWAP_CLUSTER_MAX); - nr[l] -= nr_to_scan; + nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; - nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + nr_reclaimed += shrink_list(lru, nr_to_scan, + mz, sc, priority); } } /* @@ -2066,17 +2123,89 @@ restart: * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + if (inactive_anon_is_low(mz)) + shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(zone, nr_reclaimed, + if (should_continue_reclaim(mz, nr_reclaimed, sc->nr_scanned - nr_scanned, sc)) goto restart; throttle_vm_writeout(sc->gfp_mask); } +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = priority, + }; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + shrink_mem_cgroup_zone(priority, &mz, sc); + /* + * Limit reclaim has historically picked one memcg and + * scanned it with decreasing priority levels until + * nr_to_reclaim had been reclaimed. This priority + * cycle is thus over after a single memcg. + * + * Direct reclaim and kswapd, on the other hand, have + * to scan all memory cgroups to fulfill the overall + * scan target for the zone. + */ + if (!global_reclaim(sc)) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2094,8 +2223,9 @@ restart: * scan then give up on it. * * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is either ready to begin or deferred. - * This indicates to the caller that it should retry the allocation or fail. + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2104,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool should_abort_reclaim = false; + bool aborted_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2114,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -2129,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * noticable problem, like transparent huge page * allocations. */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER && - (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) { - should_abort_reclaim = true; + if (compaction_ready(zone, sc)) { + aborted_reclaim = true; continue; } } @@ -2154,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } - return should_abort_reclaim; + return aborted_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2208,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; + bool aborted_reclaim; get_mems_allowed(); delayacct_freepages_start(); - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(sc->mem_cgroup); - if (shrink_zones(priority, zonelist, sc)) - break; + disable_swap_token(sc->target_mem_cgroup); + aborted_reclaim = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { unsigned long lru_pages = 0; for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { @@ -2287,8 +2415,12 @@ out: if (oom_killer_disabled) return 0; + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -2305,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_unmap = 1, .may_swap = 1, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, .nodemask = nodemask, }; struct shrink_control shrink = { @@ -2325,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, struct zone *zone, unsigned long *nr_scanned) @@ -2337,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_unmap = 1, .may_swap = !noswap, .order = 0, - .mem_cgroup = mem, + .target_mem_cgroup = memcg, + }; + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, }; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2354,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone(0, zone, &sc); + shrink_mem_cgroup_zone(0, &mz, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2362,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, return sc.nr_reclaimed; } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap) { @@ -2375,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, - .mem_cgroup = mem_cont, + .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2389,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * take care of from where we get pages. So the node where we start the * scan does not need to be the current node. */ - nid = mem_cgroup_select_victim_node(mem_cont); + nid = mem_cgroup_select_victim_node(memcg); zonelist = NODE_DATA(nid)->node_zonelists; @@ -2405,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc, + int priority) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct mem_cgroup_zone mz = { + .mem_cgroup = memcg, + .zone = zone, + }; + + if (inactive_anon_is_low(&mz)) + shrink_active_list(SWAP_CLUSTER_MAX, &mz, + sc, priority, 0); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. Only zones that meet watermarks and are in a zone allowed @@ -2525,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ .nr_to_reclaim = ULONG_MAX, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, @@ -2564,9 +2723,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone, &sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, - &sc, priority, 0); + age_active_anon(zone, &sc, priority); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { @@ -3355,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) */ static void check_move_unevictable_page(struct page *page, struct zone *zone) { - VM_BUG_ON(PageActive(page)); + struct lruvec *lruvec; + VM_BUG_ON(PageActive(page)); retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); + lruvec = mem_cgroup_lru_move_lists(zone, page, + LRU_UNEVICTABLE, l); + list_move(&page->lru, &lruvec->lists[l]); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -3372,8 +3531,9 @@ retry: * rotate unevictable list */ SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); + lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, + LRU_UNEVICTABLE); + list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); if (page_evictable(page, NULL)) goto retry; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b1665e..f600557a7659 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); -#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. |