From 626ebc4100285be56fe3546f29b6afeb36b6871a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:09 -0800 Subject: memcg: flatten task_struct->memcg_oom task_struct->memcg_oom is a sub-struct containing fields which are used for async memcg oom handling. Most task_struct fields aren't packaged this way and it can lead to unnecessary alignment paddings. This patch flattens it. * task.memcg_oom.memcg -> task.memcg_in_oom * task.memcg_oom.gfp_mask -> task.memcg_oom_gfp_mask * task.memcg_oom.order -> task.memcg_oom_order * task.memcg_oom.may_oom -> task.memcg_may_oom In addition, task.memcg_may_oom is relocated to where other bitfields are which reduces the size of task_struct. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e3318ddfc0e..56174c7199ee 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -406,19 +406,19 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, static inline void mem_cgroup_oom_enable(void) { - WARN_ON(current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 1; + WARN_ON(current->memcg_may_oom); + current->memcg_may_oom = 1; } static inline void mem_cgroup_oom_disable(void) { - WARN_ON(!current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 0; + WARN_ON(!current->memcg_may_oom); + current->memcg_may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.memcg; + return p->memcg_in_oom; } bool mem_cgroup_oom_synchronize(bool wait); -- cgit v1.2.3 From b23afb93d317c65cef553b804f08dec8a7a0f7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:11 -0800 Subject: memcg: punt high overage reclaim to return-to-userland path Currently, try_charge() tries to reclaim memory synchronously when the high limit is breached; however, if the allocation doesn't have __GFP_WAIT, synchronous reclaim is skipped. If a process performs only speculative allocations, it can blow way past the high limit. This is actually easily reproducible by simply doing "find /". slab/slub allocator tries speculative allocations first, so as long as there's memory which can be consumed without blocking, it can keep allocating memory regardless of the high limit. This patch makes try_charge() always punt the over-high reclaim to the return-to-userland path. If try_charge() detects that high limit is breached, it adds the overage to current->memcg_nr_pages_over_high and schedules execution of mem_cgroup_handle_over_high() which performs synchronous reclaim from the return-to-userland path. As long as kernel doesn't have a run-away allocation spree, this should provide enough protection while making kmemcg behave more consistently. It also has the following benefits. - All over-high reclaims can use GFP_KERNEL regardless of the specific gfp mask in use, e.g. GFP_NOFS, when the limit was breached. - It copes with prio inversion. Previously, a low-prio task with small memory.high might perform over-high reclaim with a bunch of locks held. If a higher prio task needed any of these locks, it would have to wait until the low prio task finished reclaim and released the locks. By handing over-high reclaim to the task exit path this issue can be avoided. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ include/linux/sched.h | 3 +++ include/linux/tracehook.h | 3 +++ mm/memcontrol.c | 47 ++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 51 insertions(+), 8 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56174c7199ee..77bf42966200 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -401,6 +401,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +void mem_cgroup_handle_over_high(void); + void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -620,6 +622,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } +static inline void mem_cgroup_handle_over_high(void) +{ +} + static inline void mem_cgroup_oom_enable(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 17bf8b845aa0..055f2ee3b0f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1809,6 +1809,9 @@ struct task_struct { struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; + + /* number of pages to reclaim on returning to userland */ + unsigned int memcg_nr_pages_over_high; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 84d497297c5f..26c152122a42 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -50,6 +50,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) smp_mb__after_atomic(); if (unlikely(current->task_works)) task_work_run(); + + mem_cgroup_handle_over_high(); } #endif /* */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47bd7f13f526..327dcda3ebf6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2080,17 +2106,22 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_WAIT but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; -- cgit v1.2.3 From cbfb479809c1b8d871cb9a31832e065e900a24c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:14 -0800 Subject: memcg: collect kmem bypass conditions into __memcg_kmem_bypass() memcg_kmem_newpage_charge() and memcg_kmem_get_cache() are testing the same series of conditions to decide whether to bypass kmem accounting. Collect the tests into __memcg_kmem_bypass(). This is pure refactoring. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 77bf42966200..d8174e8d6ab4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -777,20 +777,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -/** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. - * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. - */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; @@ -812,6 +799,26 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) if (unlikely(fatal_signal_pending(current))) return true; + return false; +} + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (__memcg_kmem_bypass(gfp)) + return true; return __memcg_kmem_newpage_charge(gfp, memcg, order); } @@ -854,17 +861,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (!memcg_kmem_enabled()) - return cachep; - if (gfp & __GFP_NOACCOUNT) - return cachep; - if (gfp & __GFP_NOFAIL) + if (__memcg_kmem_bypass(gfp)) return cachep; - if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) - return cachep; - if (unlikely(fatal_signal_pending(current))) - return cachep; - return __memcg_kmem_get_cache(cachep); } -- cgit v1.2.3 From 7f822c24c2b32a9feafb33e3b9a9e23ef4278c2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:20 -0800 Subject: memcg: drop unnecessary cold-path tests from __memcg_kmem_bypass() __memcg_kmem_bypass() decides whether a kmem allocation should be bypassed to the root memcg. Some conditions that it tests are valid criteria regarding who should be held accountable; however, there are a couple unnecessary tests for cold paths - __GFP_FAIL and fatal_signal_pending(). The previous patch updated try_charge() to handle both __GFP_FAIL and dying tasks correctly and the only thing these two tests are doing is making accounting less accurate and sprinkling tests for cold path conditions in the hot paths. There's nothing meaningful gained by these extra tests. This patch removes the two unnecessary tests from __memcg_kmem_bypass(). Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d8174e8d6ab4..641bb60dd7db 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -781,24 +781,10 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; - if (gfp & __GFP_NOACCOUNT) return true; - /* - * __GFP_NOFAIL allocations will move on even if charging is not - * possible. Therefore we don't even try, and have this allocation - * unaccounted. We could in theory charge it forcibly, but we hope - * those allocations are rare, and won't be worth the trouble. - */ - if (gfp & __GFP_NOFAIL) - return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; - - /* If the test is dying, just let it go. */ - if (unlikely(fatal_signal_pending(current))) - return true; - return false; } -- cgit v1.2.3 From 13308ca9ef9acb325b52bd8562639b5844f3cbf2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:40 -0800 Subject: mm/memcontrol: make mem_cgroup_inactive_anon_is_low() return bool Make mem_cgroup_inactive_anon_is_low return bool due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 641bb60dd7db..4142f94822ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -382,7 +382,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { unsigned long inactive_ratio; unsigned long inactive; @@ -585,10 +585,10 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline int +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { - return 1; + return true; } static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -- cgit v1.2.3 From d05e83a6f861ad02c2fcba75d4c4cfe49e3bc90f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:48:59 -0800 Subject: memcg: simplify charging kmem pages Charging kmem pages proceeds in two steps. First, we try to charge the allocation size to the memcg the current task belongs to, then we allocate a page and "commit" the charge storing the pointer to the memcg in the page struct. Such a design looks overcomplicated, because there is not much sense in trying charging the allocation before actually allocating a page: we won't be able to consume much memory over the limit even if we charge after doing the actual allocation, besides we already charge user pages post factum, so being pedantic with kmem pages just looks pointless. So this patch simplifies the design by merging the "charge" and the "commit" steps into the same function, which takes the allocated page. Also, rename the charge and uncharge methods to memcg_kmem_charge and memcg_kmem_uncharge and make the charge method return error code instead of bool to conform to mem_cgroup_try_charge. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 69 +++++++++++++--------------------------------- mm/memcontrol.c | 39 +++----------------------- mm/page_alloc.c | 18 ++++++------ 3 files changed, 32 insertions(+), 94 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4142f94822ea..c95246627f87 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,11 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. */ -bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, - int order); -void __memcg_kmem_commit_charge(struct page *page, - struct mem_cgroup *memcg, int order); -void __memcg_kmem_uncharge_pages(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); /* * helper for acessing a memcg's index. It will be used as an index in the @@ -789,52 +786,30 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) } /** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. + * Returns 0 on success, an error code on failure. */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static __always_inline int memcg_kmem_charge(struct page *page, + gfp_t gfp, int order) { if (__memcg_kmem_bypass(gfp)) - return true; - return __memcg_kmem_newpage_charge(gfp, memcg, order); + return 0; + return __memcg_kmem_charge(page, gfp, order); } /** - * memcg_kmem_uncharge_pages: uncharge pages from memcg - * @page: pointer to struct page being freed - * @order: allocation order. + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order */ -static inline void -memcg_kmem_uncharge_pages(struct page *page, int order) +static __always_inline void memcg_kmem_uncharge(struct page *page, int order) { if (memcg_kmem_enabled()) - __memcg_kmem_uncharge_pages(page, order); -} - -/** - * memcg_kmem_commit_charge: embeds correct memcg in a page - * @page: pointer to struct page recently allocated - * @memcg: the memcg structure we charged against - * @order: allocation order. - * - * Needs to be called after memcg_kmem_newpage_charge, regardless of success or - * failure of the allocation. if @page is NULL, this function will revert the - * charges. Otherwise, it will commit @page to @memcg. - */ -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) -{ - if (memcg_kmem_enabled() && memcg) - __memcg_kmem_commit_charge(page, memcg, order); + __memcg_kmem_uncharge(page, order); } /** @@ -878,18 +853,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) return false; } -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) -{ - return true; -} - -static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { + return 0; } -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +static inline void memcg_kmem_uncharge(struct page *page, int order) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1c05ff5892d..e249279f60c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2404,57 +2404,26 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret; - *_memcg = NULL; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); - return true; + return 0; } ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; css_put(&memcg->css); - return (ret == 0); -} - -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) -{ - VM_BUG_ON(mem_cgroup_is_root(memcg)); - - /* The page allocation failed. Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } page->mem_cgroup = memcg; + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06e62300d627..446bb36ee59d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } -- cgit v1.2.3 From f3ccb2c42297757d2e9b820ad37960462df7b7c1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:01 -0800 Subject: memcg: unify slab and other kmem pages charging We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and uncharging kmem pages to memcg, but currently they are not used for charging slab pages (i.e. they are only used for charging pages allocated with alloc_kmem_pages). The only reason why the slab subsystem uses special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it needs to charge to the memcg of kmem cache while memcg_charge_kmem charges to the memcg that the current task belongs to. To remove this diversity, this patch adds an extra argument to __memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is not NULL, the function tries to charge to the memcg it points to, otherwise it charge to the current context. Next, it makes the slab subsystem use this function to charge slab pages. Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since __memcg_kmem_charge stores a pointer to the memcg in the page struct, we don't need memcg_uncharge_slab anymore and can use free_kmem_pages. Besides, one can now detect which memcg a slab page belongs to by reading /proc/kpagecgroup. Note, this patch switches slab to charge-after-alloc design. Since this design is already used for all other memcg charges, it should not make any difference. [hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++-- mm/memcontrol.c | 71 +++++++++++++++++++++------------------------- mm/slab.c | 12 ++++---- mm/slab.h | 24 +++++----------- mm/slub.c | 12 ++++---- 5 files changed, 53 insertions(+), 72 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c95246627f87..0ba4c86d3b40 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,6 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. */ +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); @@ -770,10 +772,6 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep); struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages); -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); - static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e249279f60c1..9575cff544fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2215,34 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2404,36 +2376,59 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(current->mm); + unsigned int nr_pages = 1 << order; + struct page_counter *counter; + int ret = 0; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); + if (!memcg_kmem_is_active(memcg)) return 0; + + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret) + return ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); + page->mem_cgroup = memcg; + return 0; +} + +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); - page->mem_cgroup = memcg; return ret; } void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) diff --git a/mm/slab.c b/mm/slab.c index 461935bab9ef..272e809404d5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1593,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - if (memcg_charge_slab(cachep, flags, cachep->gfporder)) - return NULL; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } + if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + __free_pages(page, cachep->gfporder); + return NULL; + } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (page_is_pfmemalloc(page)) pfmemalloc_active = true; @@ -1654,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_pages(page, cachep->gfporder); - memcg_uncharge_slab(cachep, cachep->gfporder); + __free_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index bf51a8d254cf..27492eb678f7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -236,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } -static __always_inline int memcg_charge_slab(struct kmem_cache *s, - gfp_t gfp, int order) +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) { if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); -} - -static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ - if (!memcg_kmem_enabled()) - return; - if (is_root_cache(s)) - return; - memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); + return __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -289,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } -static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) { return 0; } -static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ -} - static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index e1bb147827ef..423dbe77d145 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1328,16 +1328,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, flags |= __GFP_NOTRACK; - if (memcg_charge_slab(s, flags, order)) - return NULL; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else page = __alloc_pages_node(node, flags, order); - if (!page) - memcg_uncharge_slab(s, order); + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1476,8 +1475,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); - memcg_uncharge_slab(s, order); + __free_kmem_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.2.3 From df4065516b0dbfa35ac0e9b8124d441221c0a285 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:04 -0800 Subject: memcg: simplify and inline __mem_cgroup_from_kmem Before the previous patch ("memcg: unify slab and other kmem pages charging"), __mem_cgroup_from_kmem had to handle two types of kmem - slab pages and pages allocated with alloc_kmem_pages - memcg in the page struct. Now we can unify it. Since after it, this function becomes tiny we can fold it into mem_cgroup_from_kmem. [hughd@google.com: move mem_cgroup_from_kmem into list_lru.c] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 --------------- mm/list_lru.c | 10 ++++++++++ mm/memcontrol.c | 18 ------------------ 3 files changed, 10 insertions(+), 33 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0ba4c86d3b40..998311e0c70c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -770,8 +770,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); - static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) @@ -830,13 +828,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } - -static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) -{ - if (!memcg_kmem_enabled()) - return NULL; - return __mem_cgroup_from_kmem(ptr); -} #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -882,11 +873,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } - -static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) -{ - return NULL; -} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ - diff --git a/mm/list_lru.c b/mm/list_lru.c index 28237476b055..afc71ea9a381 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -63,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + struct page *page; + + if (!memcg_kmem_enabled()) + return NULL; + page = virt_to_head_page(ptr); + return page->mem_cgroup; +} + static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9575cff544fa..c0111cb667b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2430,24 +2430,6 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; - - return memcg; -} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 45637bab30d6e7651737f51aa99417baef4d114a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:40 -0800 Subject: mm: rename mem_cgroup_migrate to mem_cgroup_replace_page After v4.3's commit 0610c25daa3e ("memcg: fix dirty page migration") mem_cgroup_migrate() doesn't have much to offer in page migration: convert migrate_misplaced_transhuge_page() to set_page_memcg() instead. Then rename mem_cgroup_migrate() to mem_cgroup_replace_page(), since its remaining callers are replace_page_cache_page() and shmem_replace_page(): both of whom passed lrucare true, so just eliminate that argument. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ++----- mm/filemap.c | 2 +- mm/memcontrol.c | 29 ++++++++--------------------- mm/migrate.c | 5 ++--- mm/shmem.c | 2 +- 5 files changed, 14 insertions(+), 31 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 998311e0c70c..c06c70b4404e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -297,8 +297,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare); +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); @@ -537,9 +536,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_migrate(struct page *oldpage, - struct page *newpage, - bool lrucare) +static inline void mem_cgroup_replace_page(struct page *old, struct page *new) { } diff --git a/mm/filemap.c b/mm/filemap.c index 884766da1165..58e04e26f996 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -551,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); - mem_cgroup_migrate(old, new, true); + mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0111cb667b5..a44494f3a965 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4531,9 +4531,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -5495,7 +5494,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5504,16 +5503,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5525,25 +5521,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. - */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* diff --git a/mm/migrate.c b/mm/migrate.c index ed72c499df8a..836e4108eaef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1831,8 +1830,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page, false); - + set_page_memcg(new_page, page_memcg(page)); + set_page_memcg(page, NULL); page_remove_rmap(page); spin_unlock(ptl); diff --git a/mm/shmem.c b/mm/shmem.c index 48ce82926d93..6529226b3814 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1023,7 +1023,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage, true); + mem_cgroup_replace_page(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v1.2.3