summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-06 10:10:54 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-06 10:10:54 +0300
commit2e3078af2c67730c479f1d183af5b367f5d95337 (patch)
treeb7881c6c9c479aadac345df7e18e3c0e10f0811e /mm/memcontrol.c
parentea5c58e70c3a148ada0d3061a8f529589bb766ba (diff)
parentb3b0d09c7a2330759ac293f5269bd932439ea0ff (diff)
downloadlinux-2e3078af2c67730c479f1d183af5b367f5d95337.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton: - inotify tweaks - some ocfs2 updates (many more are awaiting review) - various misc bits - kernel/watchdog.c updates - Some of mm. I have a huge number of MM patches this time and quite a lot of it is quite difficult and much will be held over to next time. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits) selftests: vm: add tests for lock on fault mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage mm: introduce VM_LOCKONFAULT mm: mlock: add new mlock system call mm: mlock: refactor mlock, munlock, and munlockall code kasan: always taint kernel on report mm, slub, kasan: enable user tracking by default with KASAN=y kasan: use IS_ALIGNED in memory_is_poisoned_8() kasan: Fix a type conversion error lib: test_kasan: add some testcases kasan: update reference to kasan prototype repo kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile kasan: various fixes in documentation kasan: update log messages kasan: accurately determine the type of the bad access kasan: update reported bug types for kernel memory accesses kasan: update reported bug types for not user nor kernel memory accesses mm/kasan: prevent deadlock in kasan reporting mm/kasan: don't use kasan shadow pointer in generic functions mm/kasan: MODULE_VADDR is not available on all archs ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c295
1 files changed, 116 insertions, 179 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b732edfddb76..bc502e590366 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
+#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_oom.may_oom)
+ if (!current->memcg_may_oom)
return;
/*
* We are in the middle of the charge context here, so we
@@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
* and when we know whether the fault was overall successful.
*/
css_get(&memcg->css);
- current->memcg_oom.memcg = memcg;
- current->memcg_oom.gfp_mask = mask;
- current->memcg_oom.order = order;
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
}
/**
@@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
*/
bool mem_cgroup_oom_synchronize(bool handle)
{
- struct mem_cgroup *memcg = current->memcg_oom.memcg;
+ struct mem_cgroup *memcg = current->memcg_in_oom;
struct oom_wait_info owait;
bool locked;
@@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
- current->memcg_oom.order);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+ current->memcg_oom_order);
} else {
schedule();
mem_cgroup_unmark_under_oom(memcg);
@@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
memcg_oom_recover(memcg);
}
cleanup:
- current->memcg_oom.memcg = NULL;
+ current->memcg_in_oom = NULL;
css_put(&memcg->css);
return true;
}
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg, *pos;
+
+ if (likely(!nr_pages))
+ return;
+
+ pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+ do {
+ if (page_counter_read(&pos->memory) <= pos->high)
+ continue;
+ mem_cgroup_events(pos, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+ } while ((pos = parent_mem_cgroup(pos)));
+
+ css_put(&memcg->css);
+ current->memcg_nr_pages_over_high = 0;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
@@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- int ret = 0;
if (mem_cgroup_is_root(memcg))
- goto done;
+ return 0;
retry:
if (consume_stock(memcg, nr_pages))
- goto done;
+ return 0;
if (!do_swap_account ||
- !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (!page_counter_try_charge(&memcg->memory, batch, &counter))
+ page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
if (do_swap_account)
page_counter_uncharge(&memcg->memsw, batch);
@@ -2016,7 +2041,7 @@ retry:
if (unlikely(test_thread_flag(TIF_MEMDIE) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
- goto bypass;
+ goto force;
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
@@ -2062,38 +2087,54 @@ retry:
goto retry;
if (gfp_mask & __GFP_NOFAIL)
- goto bypass;
+ goto force;
if (fatal_signal_pending(current))
- goto bypass;
+ goto force;
mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
- mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+ mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
-bypass:
- return -EINTR;
+force:
+ /*
+ * The allocation either can't fail or will lead to more memory
+ * being freed very soon. Allow memory usage go over the limit
+ * temporarily by force charging it.
+ */
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
+
+ return 0;
done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- if (!(gfp_mask & __GFP_WAIT))
- goto done;
+
/*
- * If the hierarchy is above the normal consumption range,
- * make the charging task trim their excess contribution.
+ * If the hierarchy is above the normal consumption range, schedule
+ * reclaim on returning to userland. We can perform reclaim here
+ * if __GFP_WAIT but let's always punt for simplicity and so that
+ * GFP_KERNEL can consistently be used during reclaim. @memcg is
+ * not recorded as it most likely matches current's and won't
+ * change in the meantime. As high limit is checked again before
+ * reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
- continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ if (page_counter_read(&memcg->memory) > memcg->high) {
+ current->memcg_nr_pages_over_high += nr_pages;
+ set_notify_resume(current);
+ break;
+ }
} while ((memcg = parent_mem_cgroup(memcg)));
-done:
- return ret;
+
+ return 0;
}
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
-{
- struct page_counter *counter;
- int ret = 0;
-
- ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
- if (ret < 0)
- return ret;
-
- ret = try_charge(memcg, gfp, nr_pages);
- if (ret == -EINTR) {
- /*
- * try_charge() chose to bypass to root due to OOM kill or
- * fatal signal. Since our only options are to either fail
- * the allocation or charge it to this cgroup, do it as a
- * temporary condition. But we can't fail. From a kmem/slab
- * perspective, the cache has already been selected, by
- * mem_cgroup_kmem_get_cache(), so it is too late to change
- * our minds.
- *
- * This condition will only trigger if the task entered
- * memcg_charge_kmem in a sane state, but was OOM-killed
- * during try_charge() above. Tasks that were already dying
- * when the allocation triggers should have been already
- * directed to the root cgroup in memcontrol.h
- */
- page_counter_charge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
- ret = 0;
- } else if (ret)
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
css_put(&cachep->memcg_params.memcg->css);
}
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer. We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
+ unsigned int nr_pages = 1 << order;
+ struct page_counter *counter;
int ret;
- *_memcg = NULL;
+ if (!memcg_kmem_is_active(memcg))
+ return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+ return -ENOMEM;
- if (!memcg_kmem_is_active(memcg)) {
- css_put(&memcg->css);
- return true;
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret) {
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ return ret;
}
- ret = memcg_charge_kmem(memcg, gfp, 1 << order);
- if (!ret)
- *_memcg = memcg;
+ page->mem_cgroup = memcg;
- css_put(&memcg->css);
- return (ret == 0);
+ return 0;
}
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
- int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ struct mem_cgroup *memcg;
+ int ret;
- /* The page allocation failed. Revert */
- if (!page) {
- memcg_uncharge_kmem(memcg, 1 << order);
- return;
- }
- page->mem_cgroup = memcg;
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ css_put(&memcg->css);
+ return ret;
}
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
+ unsigned int nr_pages = 1 << order;
if (!memcg)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- memcg_uncharge_kmem(memcg, 1 << order);
- page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
- struct mem_cgroup *memcg = NULL;
- struct kmem_cache *cachep;
- struct page *page;
-
- page = virt_to_head_page(ptr);
- if (PageSlab(page)) {
- cachep = page->slab_cache;
- if (!is_root_cache(cachep))
- memcg = cachep->memcg_params.memcg;
- } else
- /* page allocated by alloc_kmem_pages */
- memcg = page->mem_cgroup;
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_uncharge(&memcg->memsw, nr_pages);
- return memcg;
+ page->mem_cgroup = NULL;
+ css_put_many(&memcg->css, nr_pages);
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- u64 val;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
else
val = page_counter_read(&memcg->memsw);
}
- return val << PAGE_SHIFT;
+ return val;
}
enum {
@@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
if (counter == &memcg->memory)
- return mem_cgroup_usage(memcg, false);
+ return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
if (counter == &memcg->memsw)
- return mem_cgroup_usage(memcg, true);
+ return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
return (u64)counter->limit * PAGE_SIZE;
@@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
- threshold <<= PAGE_SHIFT;
mutex_lock(&memcg->thresholds_lock);
@@ -4406,22 +4370,10 @@ static int mem_cgroup_do_precharge(unsigned long count)
mc.precharge += count;
return ret;
}
- if (ret == -EINTR) {
- cancel_charge(root_mem_cgroup, count);
- return ret;
- }
/* Try charges one by one with reclaim */
while (count--) {
ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
- /*
- * In case of failure, any residual charges against
- * mc.to will be dropped by mem_cgroup_clear_mc()
- * later on. However, cancel any charges that are
- * bypassed to root right away or they'll be lost.
- */
- if (ret == -EINTR)
- cancel_charge(root_mem_cgroup, 1);
if (ret)
return ret;
mc.precharge++;
@@ -4576,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
/*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
+ * Prevent mem_cgroup_replace_page() from looking at
+ * page->mem_cgroup of its source page while we change it.
*/
if (!trylock_page(page))
goto out;
@@ -5085,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
static int memory_low_show(struct seq_file *m, void *v)
@@ -5197,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static struct cftype memory_files[] = {
{
.name = "current",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
{
@@ -5340,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
ret = try_charge(memcg, gfp_mask, nr_pages);
css_put(&memcg->css);
-
- if (ret == -EINTR) {
- memcg = root_mem_cgroup;
- ret = 0;
- }
out:
*memcgp = memcg;
return ret;
@@ -5559,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
* @lrucare: either or both pages might be on the LRU already
@@ -5568,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
*
* Both pages must be locked, @newpage->mapping must be set up.
*/
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
- bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
int isolated;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
newpage);
@@ -5589,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
if (newpage->mem_cgroup)
return;
- /*
- * Swapcache readahead pages can get migrated before being
- * charged, and migration from compaction can happen to an
- * uncharged page when the PFN walker finds a page that
- * reclaim just put back on the LRU but has not released yet.
- */
+ /* Swapcache readahead pages can get replaced before being charged */
memcg = oldpage->mem_cgroup;
if (!memcg)
return;
- if (lrucare)
- lock_page_lru(oldpage, &isolated);
-
+ lock_page_lru(oldpage, &isolated);
oldpage->mem_cgroup = NULL;
+ unlock_page_lru(oldpage, isolated);
- if (lrucare)
- unlock_page_lru(oldpage, isolated);
-
- commit_charge(newpage, memcg, lrucare);
+ commit_charge(newpage, memcg, true);
}
/*