diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 10:10:54 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 10:10:54 +0300 |
commit | 2e3078af2c67730c479f1d183af5b367f5d95337 (patch) | |
tree | b7881c6c9c479aadac345df7e18e3c0e10f0811e /mm/memcontrol.c | |
parent | ea5c58e70c3a148ada0d3061a8f529589bb766ba (diff) | |
parent | b3b0d09c7a2330759ac293f5269bd932439ea0ff (diff) | |
download | linux-2e3078af2c67730c479f1d183af5b367f5d95337.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton:
- inotify tweaks
- some ocfs2 updates (many more are awaiting review)
- various misc bits
- kernel/watchdog.c updates
- Some of mm. I have a huge number of MM patches this time and quite a
lot of it is quite difficult and much will be held over to next time.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits)
selftests: vm: add tests for lock on fault
mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage
mm: introduce VM_LOCKONFAULT
mm: mlock: add new mlock system call
mm: mlock: refactor mlock, munlock, and munlockall code
kasan: always taint kernel on report
mm, slub, kasan: enable user tracking by default with KASAN=y
kasan: use IS_ALIGNED in memory_is_poisoned_8()
kasan: Fix a type conversion error
lib: test_kasan: add some testcases
kasan: update reference to kasan prototype repo
kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile
kasan: various fixes in documentation
kasan: update log messages
kasan: accurately determine the type of the bad access
kasan: update reported bug types for kernel memory accesses
kasan: update reported bug types for not user nor kernel memory accesses
mm/kasan: prevent deadlock in kasan reporting
mm/kasan: don't use kasan shadow pointer in generic functions
mm/kasan: MODULE_VADDR is not available on all archs
...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 295 |
1 files changed, 116 insertions, 179 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b732edfddb76..bc502e590366 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/lockdep.h> #include <linux/file.h> +#include <linux/tracehook.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. */ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2016,7 +2041,7 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -2062,38 +2087,54 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. + */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_WAIT but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + unsigned int nr_pages = 1 << order; + struct page_counter *counter; int ret; - *_memcg = NULL; + if (!memcg_kmem_is_active(memcg)) + return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); - return true; + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; + page->mem_cgroup = memcg; - css_put(&memcg->css); - return (ret == 0); + return 0; } -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { - VM_BUG_ON(mem_cgroup_is_root(memcg)); + struct mem_cgroup *memcg; + int ret; - /* The page allocation failed. Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } - page->mem_cgroup = memcg; + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + css_put(&memcg->css); + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); - page->mem_cgroup = NULL; -} - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); - return memcg; + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } #endif /* CONFIG_MEMCG_KMEM */ @@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; - threshold <<= PAGE_SHIFT; mutex_lock(&memcg->thresholds_lock); @@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { @@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; @@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. - */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* |