diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 363 |
1 files changed, 156 insertions, 207 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9b5c817dce8..9acfb165eb52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/lockdep.h> #include <linux/file.h> +#include <linux/tracehook.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -434,7 +435,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) memcg = page->mem_cgroup; - if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) + if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; rcu_read_unlock(); @@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. */ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2016,12 +2041,12 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -2062,38 +2087,54 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. + */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_RECLAIM but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + unsigned int nr_pages = 1 << order; + struct page_counter *counter; int ret; - *_memcg = NULL; + if (!memcg_kmem_is_active(memcg)) + return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); - return true; + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; + page->mem_cgroup = memcg; - css_put(&memcg->css); - return (ret == 0); + return 0; } -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { - VM_BUG_ON(mem_cgroup_is_root(memcg)); + struct mem_cgroup *memcg; + int ret; - /* The page allocation failed. Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } - page->mem_cgroup = memcg; + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + css_put(&memcg->css); + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); - page->mem_cgroup = NULL; -} - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); - return memcg; + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } #endif /* CONFIG_MEMCG_KMEM */ @@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -2926,7 +2891,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_has_tasks(memcg->css.cgroup) || + if (cgroup_is_populated(memcg->css.cgroup) || (memcg->use_hierarchy && memcg_has_children(memcg))) err = -EBUSY; mutex_unlock(&memcg_create_mutex); @@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; - threshold <<= PAGE_SHIFT; mutex_lock(&memcg->thresholds_lock); @@ -3741,44 +3705,43 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question - * @pavail: out parameter for number of available pages + * @pfilepages: out parameter for number of file pages + * @pheadroom: out parameter for number of allocatable pages according to memcg * @pdirty: out parameter for number of dirty pages * @pwriteback: out parameter for number of pages under writeback * - * Determine the numbers of available, dirty, and writeback pages in @wb's - * memcg. Dirty and writeback are self-explanatory. Available is a bit - * more involved. + * Determine the numbers of file, headroom, dirty, and writeback pages in + * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom + * is a bit more involved. * - * A memcg's headroom is "min(max, high) - used". The available memory is - * calculated as the lowest headroom of itself and the ancestors plus the - * number of pages already being used for file pages. Note that this - * doesn't consider the actual amount of available memory in the system. - * The caller should further cap *@pavail accordingly. + * A memcg's headroom is "min(max, high) - used". In the hierarchy, the + * headroom is calculated as the lowest headroom of itself and the + * ancestors. Note that this doesn't consider the actual amount of + * available memory in the system. The caller should further cap + * *@pheadroom accordingly. */ -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, - unsigned long *pdirty, unsigned long *pwriteback) +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, + unsigned long *pheadroom, unsigned long *pdirty, + unsigned long *pwriteback) { struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - unsigned long head_room = PAGE_COUNTER_MAX; - unsigned long file_pages; *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | + (1 << LRU_ACTIVE_FILE)); + *pheadroom = PAGE_COUNTER_MAX; - file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | - (1 << LRU_ACTIVE_FILE)); while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(memcg->memory.limit, memcg->high); unsigned long used = page_counter_read(&memcg->memory); - head_room = min(head_room, ceiling - min(ceiling, used)); + *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); memcg = parent; } - - *pavail = file_pages + head_room; } #else /* CONFIG_CGROUP_WRITEBACK */ @@ -4067,8 +4030,7 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "cgroup.event_control", /* XXX: for compat */ .write = memcg_write_event_control, - .flags = CFTYPE_NO_PREFIX, - .mode = S_IWUGO, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, }, { .name = "swappiness", @@ -4402,28 +4364,16 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -4578,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -4835,7 +4784,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *from; - struct task_struct *p; + struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; @@ -4849,7 +4798,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, if (!move_flags) return 0; - p = cgroup_taskset_first(tset); + /* + * Multi-process migrations only happen on the default hierarchy + * where charge immigration is not used. Perform charge + * immigration if @tset contains a leader and whine if there are + * multiple. + */ + p = NULL; + cgroup_taskset_for_each_leader(leader, tset) { + WARN_ON_ONCE(p); + p = leader; + } + if (!p) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -5065,7 +5027,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_on_dfl(root_css->cgroup)) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) root_mem_cgroup->use_hierarchy = true; else root_mem_cgroup->use_hierarchy = false; @@ -5074,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5186,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { @@ -5209,6 +5174,7 @@ static struct cftype memory_files[] = { { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, { } /* terminate */ @@ -5328,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; @@ -5547,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5556,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5577,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. - */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* |