Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 502 |
1 file changed, 284 insertions, 218 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca0bc6e6be13..4f05735b02d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
 	unsigned long ino = 0;
 
 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	if (PageHead(page) && PageSlab(page))
+		memcg = memcg_from_slab_page(page);
+	else
+		memcg = READ_ONCE(page->mem_cgroup);
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -691,11 +695,12 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
+	__this_cpu_add(memcg->vmstats_local->stat[idx], val);
+
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		atomic_long_add(x, &memcg->vmstats_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
@@ -745,11 +750,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__mod_memcg_state(memcg, idx, val);
 
 	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup_per_node *pi;
 
-		atomic_long_add(x, &pn->lruvec_stat_local[idx]);
 		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
@@ -771,11 +777,12 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;
 
+	__this_cpu_add(memcg->vmstats_local->events[idx], count);
+
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 		struct mem_cgroup *mi;
 
-		atomic_long_add(x, &memcg->vmevents_local[idx]);
 		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
@@ -790,7 +797,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents_local[event]);
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -1155,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		struct css_task_iter it;
 		struct task_struct *task;
 
-		css_task_iter_start(&iter->css, 0, &it);
+		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 		while (!ret && (task = css_task_iter_next(&it)))
 			ret = fn(task, arg);
 		css_task_iter_end(&it);
@@ -1247,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 		*lru_size += nr_pages;
 }
 
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -1348,27 +1334,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
-static const unsigned int memcg1_stats[] = {
-	MEMCG_CACHE,
-	MEMCG_RSS,
-	MEMCG_RSS_HUGE,
-	NR_SHMEM,
-	NR_FILE_MAPPED,
-	NR_FILE_DIRTY,
-	NR_WRITEBACK,
-	MEMCG_SWAP,
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+	struct seq_buf s;
+	int i;
 
-static const char *const memcg1_stat_names[] = {
-	"cache",
-	"rss",
-	"rss_huge",
-	"shmem",
-	"mapped_file",
-	"dirty",
-	"writeback",
-	"swap",
-};
+	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+	if (!s.buffer)
+		return NULL;
+
+	/*
+	 * Provide statistics on the state of the memory subsystem as
+	 * well as cumulative event counters that show past behavior.
+	 *
+	 * This list is ordered following a combination of these gradients:
+	 * 1) generic big picture -> specifics and details
+	 * 2) reflecting userspace activity -> reflecting kernel heuristics
+	 *
+	 * Current memory state:
+	 */
+
+	seq_buf_printf(&s, "anon %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "kernel_stack %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+		       1024);
+	seq_buf_printf(&s, "slab %llu\n",
+		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "sock %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+		       PAGE_SIZE);
+
+	seq_buf_printf(&s, "shmem %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SHMEM) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_mapped %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_dirty %llu\n",
+		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "file_writeback %llu\n",
+		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+		       PAGE_SIZE);
+
+	/*
+	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
+	 * arse because it requires migrating the work out of rmap to a place
+	 * where the page->mem_cgroup is set up and stable.
+	 */
+	seq_buf_printf(&s, "anon_thp %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+		       PAGE_SIZE);
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+			       PAGE_SIZE);
+
+	seq_buf_printf(&s, "slab_reclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+		       PAGE_SIZE);
+	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+		       PAGE_SIZE);
+
+	/* Accumulated memory events */
+
+	seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+	seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+	seq_buf_printf(&s, "workingset_refault %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_REFAULT));
+	seq_buf_printf(&s, "workingset_activate %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+	seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+	seq_buf_printf(&s, "pgscan %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD) +
+		       memcg_events(memcg, PGSCAN_DIRECT));
+	seq_buf_printf(&s, "pgsteal %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD) +
+		       memcg_events(memcg, PGSTEAL_DIRECT));
+	seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+	seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+	seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+	seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+		       memcg_events(memcg, THP_FAULT_ALLOC));
+	seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+	/* The above should easily fit into one page */
+	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+	return s.buffer;
+}
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /**
@@ -1403,39 +1476,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
  */
 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *iter;
-	unsigned int i;
+	char *buf;
 
 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
 		K((u64)page_counter_read(&memcg->memory)),
 		K((u64)memcg->memory.max), memcg->memory.failcnt);
-	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memsw)),
-		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
-	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->kmem)),
-		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		pr_info("Memory cgroup stats for ");
-		pr_cont_cgroup_path(iter->css.cgroup);
-		pr_cont(":");
-
-		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
-				continue;
-			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state_local(iter,
-							 memcg1_stats[i])));
-		}
-
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(memcg_page_state_local(iter,
-							 NR_LRU_BASE + i)));
-
-		pr_cont("\n");
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->swap)),
+			K((u64)memcg->swap.max), memcg->swap.failcnt);
+	else {
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
 	}
+
+	pr_info("Memory cgroup stats for ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(":");
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return;
+	pr_info("%s", buf);
+	kfree(buf);
 }
 
 /*
@@ -2191,11 +2257,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 		long x;
 
 		x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-		if (x) {
-			atomic_long_add(x, &memcg->vmstats_local[i]);
+		if (x)
 			for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 				atomic_long_add(x, &memcg->vmstats[i]);
-		}
 
 		if (i >= NR_VM_NODE_STAT_ITEMS)
 			continue;
@@ -2205,12 +2269,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 
 			pn = mem_cgroup_nodeinfo(memcg, nid);
 			x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
-			if (x) {
-				atomic_long_add(x, &pn->lruvec_stat_local[i]);
+			if (x)
 				do {
 					atomic_long_add(x, &pn->lruvec_stat[i]);
 				} while ((pn = parent_nodeinfo(pn, nid)));
-			}
 		}
 	}
 
@@ -2218,11 +2280,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 		long x;
 
 		x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-		if (x) {
-			atomic_long_add(x, &memcg->vmevents_local[i]);
+		if (x)
 			for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 				atomic_long_add(x, &memcg->vmevents[i]);
-		}
 	}
 }
 
@@ -2277,7 +2337,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long nr_reclaimed;
 	bool may_swap = true;
 	bool drained = false;
-	bool oomed = false;
 	enum oom_status oom_status;
 
 	if (mem_cgroup_is_root(memcg))
@@ -2364,7 +2423,7 @@ retry:
 	if (nr_retries--)
 		goto retry;
 
-	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+	if (gfp_mask & __GFP_RETRY_MAYFAIL)
 		goto nomem;
 
 	if (gfp_mask & __GFP_NOFAIL)
@@ -2383,7 +2442,6 @@ retry:
 	switch (oom_status) {
 	case OOM_SUCCESS:
 		nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		oomed = true;
 		goto retry;
 	case OOM_FAILED:
 		goto force;
@@ -2586,12 +2644,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2626,6 +2685,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
@@ -2633,14 +2693,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
+
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2653,10 +2727,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the fist step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2667,7 +2751,7 @@ out:
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -2695,9 +2779,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 		cancel_charge(memcg, nr_pages);
 		return -ENOMEM;
 	}
-
-	page->mem_cgroup = memcg;
-
 	return 0;
 }
 
@@ -2720,12 +2801,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 	memcg = get_mem_cgroup_from_current();
 	if (!mem_cgroup_is_root(memcg)) {
 		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
-		if (!ret)
+		if (!ret) {
+			page->mem_cgroup = memcg;
 			__SetPageKmemcg(page);
+		}
 	}
 	css_put(&memcg->css);
 	return ret;
 }
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+				 unsigned int nr_pages)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_uncharge(&memcg->kmem, nr_pages);
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
 /**
  * __memcg_kmem_uncharge: uncharge a kmem page
  * @page: page to uncharge
@@ -2740,14 +2839,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 		return;
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		page_counter_uncharge(&memcg->kmem, nr_pages);
-
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (do_memsw_account())
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-
+	__memcg_kmem_uncharge_memcg(memcg, nr_pages);
 	page->mem_cgroup = NULL;
 
 	/* slab pages do not have PageKmemcg flag set */
@@ -3166,15 +3258,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 	 */
 	memcg->kmem_state = KMEM_ALLOCATED;
 
-	memcg_deactivate_kmem_caches(memcg);
-
-	kmemcg_id = memcg->kmemcg_id;
-	BUG_ON(kmemcg_id < 0);
-
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
 		parent = root_mem_cgroup;
 
+	memcg_deactivate_kmem_caches(memcg, parent);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
 	/*
 	 * Change kmemcg_id of this cgroup and all its descendants to the
 	 * parent's id, and then move all entries from this cgroup's list_lrus
@@ -3205,9 +3297,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 		memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
-		WARN_ON(page_counter_read(&memcg->kmem));
 	}
 }
 #else
@@ -3470,6 +3561,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+static const unsigned int memcg1_stats[] = {
+	MEMCG_CACHE,
+	MEMCG_RSS,
+	MEMCG_RSS_HUGE,
+	NR_SHMEM,
+	NR_FILE_MAPPED,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+	"cache",
+	"rss",
+	"rss_huge",
+	"shmem",
+	"mapped_file",
+	"dirty",
+	"writeback",
+	"swap",
+};
+
 /* Universal VM events cgroup1 shows, original sort order */
 static const unsigned int memcg1_events[] = {
 	PGPGIN,
@@ -3528,12 +3641,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)memcg_events(memcg, i));
+			   (u64)memcg_events(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -4483,8 +4597,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
+	pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_local) {
+		kfree(pn);
+		return 1;
+	}
+
 	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
 	if (!pn->lruvec_stat_cpu) {
+		free_percpu(pn->lruvec_stat_local);
 		kfree(pn);
 		return 1;
 	}
@@ -4506,6 +4627,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return;
 
 	free_percpu(pn->lruvec_stat_cpu);
+	free_percpu(pn->lruvec_stat_local);
 	kfree(pn);
 }
 
@@ -4516,6 +4638,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
+	free_percpu(memcg->vmstats_local);
 	kfree(memcg);
 }
 
@@ -4544,6 +4667,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
+	memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+	if (!memcg->vmstats_local)
+		goto fail;
+
 	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
 	if (!memcg->vmstats_percpu)
 		goto fail;
@@ -4619,6 +4746,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
 	/* The following stuff does not apply to the root */
 	if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+		INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
@@ -5610,112 +5740,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+	seq_printf(m, "oom_kill %lu\n",
+		   atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	seq_printf(m, "low %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
-	seq_printf(m, "high %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
-	seq_printf(m, "max %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
-	seq_printf(m, "oom %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
-	seq_printf(m, "oom_kill %lu\n",
-		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
-
+	__memory_events_show(m, memcg->memory_events);
 	return 0;
 }
 
-static int memory_stat_show(struct seq_file *m, void *v)
+static int memory_events_local_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-	int i;
-
-	/*
-	 * Provide statistics on the state of the memory subsystem as
-	 * well as cumulative event counters that show past behavior.
-	 *
-	 * This list is ordered following a combination of these gradients:
-	 * 1) generic big picture -> specifics and details
-	 * 2) reflecting userspace activity -> reflecting kernel heuristics
-	 *
-	 * Current memory state:
-	 */
-
-	seq_printf(m, "anon %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
-	seq_printf(m, "file %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
-	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
-	seq_printf(m, "slab %llu\n",
-		   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-		   memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-		   PAGE_SIZE);
-	seq_printf(m, "sock %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
-
-	seq_printf(m, "shmem %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
-	seq_printf(m, "file_mapped %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
-	seq_printf(m, "file_dirty %llu\n",
-		   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
-	seq_printf(m, "file_writeback %llu\n",
-		   (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
-
-	/*
-	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
-	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
-	 * arse because it requires migrating the work out of rmap to a place
-	 * where the page->mem_cgroup is set up and stable.
-	 */
-	seq_printf(m, "anon_thp %llu\n",
-		   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
-
-	for (i = 0; i < NR_LRU_LISTS; i++)
-		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
-			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
-			   PAGE_SIZE);
-
-	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-		   PAGE_SIZE);
-	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-		   PAGE_SIZE);
-
-	/* Accumulated memory events */
-
-	seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
-	seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
-
-	seq_printf(m, "workingset_refault %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_REFAULT));
-	seq_printf(m, "workingset_activate %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
-	seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
-	seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
-		   memcg_events(memcg, PGSCAN_DIRECT));
-	seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
-		   memcg_events(memcg, PGSTEAL_DIRECT));
-	seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
-	seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
-	seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
-	seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+	__memory_events_show(m, memcg->memory_events_local);
+	return 0;
+}
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	seq_printf(m, "thp_fault_alloc %lu\n",
-		   memcg_events(memcg, THP_FAULT_ALLOC));
-	seq_printf(m, "thp_collapse_alloc %lu\n",
-		   memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	char *buf;
 
+	buf = memory_stat_format(memcg);
+	if (!buf)
+		return -ENOMEM;
+	seq_puts(m, buf);
+	kfree(buf);
 	return 0;
 }
 
@@ -5787,6 +5847,12 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_events_show,
 	},
 	{
+		.name = "events.local",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct mem_cgroup, events_local_file),
+		.seq_show = memory_events_local_show,
+	},
+	{
		.name = "stat",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = memory_stat_show,