summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c268
1 files changed, 154 insertions, 114 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b807952b4d43..7f74a158cfa8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,14 +197,6 @@ static struct move_charge_struct {
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-enum charge_type {
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_ANON,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
- NR_CHARGE_TYPE,
-};
-
/* for encoding cft->private value on file */
enum res_type {
_MEM,
@@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+struct memory_stat { /* one row of the memory_stats[] table */
+ const char *name; /* label printed in front of the value */
+ unsigned int ratio; /* multiplier applied to the raw counter before printing */
+ unsigned int idx; /* counter index handed to memcg_page_state()/lruvec_page_state() */
+};
+
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio is filled in by memory_stats_init() because, on some
+ * architectures (e.g. powerpc), HPAGE_PMD_SIZE is not a
+ * compile-time constant.
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: The slab_reclaimable and slab_unreclaimable must be
+ * together and slab_reclaimable must be in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void) /* fills in the runtime-only ratio and sanity-checks memory_stats[] */
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE; /* replaces the 0 placeholder from the table */
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio); /* no entry may be left with a zero ratio */
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); /* idx must stay below MEMCG_NR_STAT */
+ }
+
+ return 0; /* always reports success */
+}
+pure_initcall(memory_stats_init); /* registers the function above as an initcall */
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
@@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
* Current memory state:
*/
- seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
- PAGE_SIZE);
- seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
- 1024);
- seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
- seq_buf_printf(&s, "percpu %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
- seq_buf_printf(&s, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) *
- PAGE_SIZE);
-
- seq_buf_printf(&s, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) *
- PAGE_SIZE);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_THPS) *
- HPAGE_PMD_SIZE);
-#endif
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
- seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
- seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
/* Accumulated memory events */
@@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
-
- seq_buf_printf(&s, "workingset_refault_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
- seq_buf_printf(&s, "workingset_refault_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
- seq_buf_printf(&s, "workingset_activate_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
- seq_buf_printf(&s, "workingset_activate_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
- seq_buf_printf(&s, "workingset_restore %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
- seq_buf_printf(&s, "workingset_restore %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
- seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
@@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max;
+ unsigned long max = READ_ONCE(memcg->memory.max);
- max = READ_ONCE(memcg->memory.max);
- if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_max;
- unsigned long swap_max;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
- memsw_max = memcg->memsw.max;
- swap_max = READ_ONCE(memcg->swap.max);
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
- max = min(max + swap_max, memsw_max);
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
}
return max;
}
@@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+ * Be careful about under_oom underflows because a child memcg
+ * could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
@@ -2888,6 +2897,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
@@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds->primary) {
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
- sizeof(struct mem_cgroup_threshold));
- }
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
@@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
/*
@@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
@@ -5500,7 +5516,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON))
return NULL;
/*
@@ -5519,6 +5535,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return page;
}
+ if (non_swap_entry(ent))
+ return NULL;
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -5539,35 +5558,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- struct page *page = NULL;
- struct address_space *mapping;
- pgoff_t pgoff;
-
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- mapping = vma->vm_file->f_mapping;
- pgoff = linear_page_index(vma, addr);
-
/* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- *entry = swp;
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
- return page;
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
}
/**
@@ -5643,7 +5642,7 @@ static int mem_cgroup_move_account(struct page *page,
if (PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
-nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
@@ -6393,6 +6392,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v) /* seq_show handler for the "numa_stat" file; v is unused */
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { /* one output line per eligible table entry */
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) /* NOTE(review): presumably filters out counters not kept per node - confirm */
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name); /* line starts with the stat name */
+ for_each_node_state(nid, N_MEMORY) { /* then one " N<nid>=<value>" field per node in N_MEMORY */
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio; /* scale the raw counter by the entry's ratio */
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0; /* always reports success */
+}
+#endif /* CONFIG_NUMA */
+
static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6470,6 +6498,12 @@ static struct cftype memory_files[] = {
.name = "stat",
.seq_show = memory_stat_show,
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
@@ -6774,6 +6808,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
+
+ /* drop reference from uncharge_page */
+ css_put(&ug->memcg->css);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6797,6 +6834,9 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
+
+ /* pairs with css_put in uncharge_batch */
+ css_get(&ug->memcg->css);
}
nr_pages = compound_nr(page);