diff options
author | Muchun Song <songmuchun@bytedance.com> | 2021-04-30 08:56:52 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-30 21:20:38 +0300 |
commit | b4e0b68fbd9d1fd7e31cbe8adca3ad6cf556e2ee (patch) | |
tree | 45fc74ef3200d97941ec6579e7c9ec9bea7bdca6 /mm/memcontrol.c | |
parent | 7ab345a8973017c89a1be87b6c8722d1fee1fd95 (diff) | |
download | linux-b4e0b68fbd9d1fd7e31cbe8adca3ad6cf556e2ee.tar.xz |
mm: memcontrol: use obj_cgroup APIs to charge kmem pages
Since Roman's series "The new cgroup slab memory controller" applied.
All slab objects are charged via the new APIs of obj_cgroup. The new
APIs introduce a struct obj_cgroup to charge slab objects. It prevents
long-living objects from pinning the original memory cgroup in the
memory. But there are still some corner objects (e.g. allocations
larger than order-1 page on SLUB) which are not charged via the new
APIs. Those objects (include the pages which are allocated from buddy
allocator directly) are charged as kmem pages which still hold a
reference to the memory cgroup.
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.
Finally, page->memcg_data will have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
We do not change the behavior of page_memcg() and page_memcg_rcu(). They
are also suitable for LRU pages and kmem pages. Why?
Because memory allocations pinning memcgs for a long time - it exists at a
larger scale and is causing recurring problems in the real world: page
cache doesn't get reclaimed for a long time, or is used by the second,
third, fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory, and
make page reclaim very inefficient.
We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer. At that time, LRU pages and kmem pages will
be treated the same. The implementation of page_memcg() will remove the
kmem page check.
This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup. Finally, the page->memcg_data of the kmem page points to an
object cgroup. We can use the __page_objcg() to get the object cgroup
associated with a kmem page. Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).
Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 116 |
1 files changed, 58 insertions, 58 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85fd2d86d23f..f3ddf94c9485 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -865,18 +865,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ - struct mem_cgroup *memcg = page_memcg(head); + struct mem_cgroup *memcg; pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; + rcu_read_lock(); + memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { + rcu_read_unlock(); __mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); + rcu_read_unlock(); } EXPORT_SYMBOL(__mod_lruvec_page_state); @@ -1051,20 +1055,6 @@ static __always_inline struct mem_cgroup *active_memcg(void) return current->active_memcg; } -static __always_inline struct mem_cgroup *get_active_memcg(void) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = active_memcg(); - /* remote memcg must hold a ref. */ - if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - - return memcg; -} - static __always_inline bool memcg_kmem_bypass(void) { /* Allow remote memcg charging from any context. */ @@ -1079,20 +1069,6 @@ static __always_inline bool memcg_kmem_bypass(void) } /** - * If active memcg is set, do not fallback to current->mm->memcg. - */ -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) -{ - if (memcg_kmem_bypass()) - return NULL; - - if (unlikely(active_memcg())) - return get_active_memcg(); - - return get_mem_cgroup_from_mm(current->mm); -} - -/** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation @@ -3121,18 +3097,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page */ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; int ret = 0; - memcg = get_mem_cgroup_from_current(); - if (memcg && !mem_cgroup_is_root(memcg)) { - ret = __memcg_kmem_charge(memcg, gfp, 1 << order); + objcg = get_obj_cgroup_from_current(); + if (objcg) { + ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg | + page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - css_put(&memcg->css); + obj_cgroup_put(objcg); } return ret; } @@ -3144,16 +3120,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page_memcg(page); + struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!memcg) + if (!PageMemcgKmem(page)) return; - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - __memcg_kmem_uncharge(memcg, nr_pages); + objcg = __page_objcg(page); + obj_cgroup_uncharge_pages(objcg, nr_pages); page->memcg_data = 0; - css_put(&memcg->css); + obj_cgroup_put(objcg); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3294,7 +3270,11 @@ void split_page_memcg(struct page *head, unsigned int nr) for (i = 1; i < nr; i++) head[i].memcg_data = head->memcg_data; - css_get_many(&memcg->css, nr - 1); + + if (PageMemcgKmem(head)) + obj_cgroup_get_many(__page_objcg(head), nr - 1); + else + css_get_many(&memcg->css, nr - 1); } #ifdef CONFIG_MEMCG_SWAP @@ -6788,7 +6768,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) struct uncharge_gather { struct mem_cgroup *memcg; - unsigned long nr_pages; + unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; struct page *dummy_page; @@ -6803,10 +6783,10 @@ static void uncharge_batch(const struct uncharge_gather *ug) { unsigned long flags; - if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); + if (ug->nr_memory) { + page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); @@ -6814,7 +6794,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -6825,40 +6805,60 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page_memcg(page)) - return; - /* * Nobody should be changing or seriously looking at - * page_memcg(page) at this point, we have fully + * page memcg or objcg at this point, we have fully * exclusive access to the page. */ + if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. + */ + memcg = get_mem_cgroup_from_objcg(objcg); + } else { + memcg = __page_memcg(page); + } - if (ug->memcg != page_memcg(page)) { + if (!memcg) + return; + + if (ug->memcg != memcg) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page_memcg(page); + ug->memcg = memcg; ug->dummy_page = page; /* pairs with css_put in uncharge_batch */ - css_get(&ug->memcg->css); + css_get(&memcg->css); } nr_pages = compound_nr(page); - ug->nr_pages += nr_pages; - if (PageMemcgKmem(page)) + if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - else + + page->memcg_data = 0; + obj_cgroup_put(objcg); + } else { + /* LRU pages aren't accounted at the root level */ + if (!mem_cgroup_is_root(memcg)) + ug->nr_memory += nr_pages; ug->pgpgout++; - page->memcg_data = 0; - css_put(&ug->memcg->css); + page->memcg_data = 0; + } + + css_put(&memcg->css); } /** |