summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2018-02-01 03:16:45 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2018-02-01 04:18:36 +0300
commita983b5ebee57209c99f68c8327072f25e0e6e3da (patch)
treeae227de941d587feb9fc94e40165607b24579266 /include/linux
parent284542656e22c43fdada8c8cc0ca9ede8453eed7 (diff)
downloadlinux-a983b5ebee57209c99f68c8327072f25e0e6e3da.tar.xz
mm: memcontrol: fix excessive complexity in memory.stat reporting
We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache. Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this: nr_cgroups * nr_stat_items * nr_possible_cpus where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale. Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented. Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity. [hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls] Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/memcontrol.h96
1 files changed, 62 insertions, 34 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1ffc54ac4cc9..882046863581 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -108,7 +108,10 @@ struct lruvec_stat {
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
- struct lruvec_stat __percpu *lruvec_stat;
+
+ struct lruvec_stat __percpu *lruvec_stat_cpu;
+ atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
@@ -227,10 +230,10 @@ struct mem_cgroup {
spinlock_t move_lock;
struct task_struct *move_lock_task;
unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
+
+ struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+ atomic_long_t stat[MEMCG_NR_STAT];
+ atomic_long_t events[MEMCG_NR_EVENTS];
unsigned long socket_pressure;
@@ -265,6 +268,12 @@ struct mem_cgroup {
/* WARNING: nodeinfo must be the last member here */
};
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
extern struct mem_cgroup *root_mem_cgroup;
static inline bool mem_cgroup_disabled(void)
@@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page);
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
int idx)
{
- long val = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->count[idx], cpu);
-
- if (val < 0)
- val = 0;
-
- return val;
+ long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
{
- if (!mem_cgroup_disabled())
- __this_cpu_add(memcg->stat->count[idx], val);
+ long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ atomic_long_add(x, &memcg->stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->stat_cpu->count[idx], x);
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
{
- if (!mem_cgroup_disabled())
- this_cpu_add(memcg->stat->count[idx], val);
+ preempt_disable();
+ __mod_memcg_state(memcg, idx, val);
+ preempt_enable();
}
/**
@@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
- long val = 0;
- int cpu;
+ long x;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for_each_possible_cpu(cpu)
- val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
- if (val < 0)
- val = 0;
-
- return val;
+ x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
static inline void __mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
struct mem_cgroup_per_node *pn;
+ long x;
/* Update node */
__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
@@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec,
__mod_memcg_state(pn->memcg, idx, val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stat->count[idx], val);
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ atomic_long_add(x, &pn->lruvec_stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
static inline void mod_lruvec_state(struct lruvec *lruvec,
@@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
static inline void __count_memcg_events(struct mem_cgroup *memcg,
int idx, unsigned long count)
{
- if (!mem_cgroup_disabled())
- __this_cpu_add(memcg->stat->events[idx], count);
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ atomic_long_add(x, &memcg->events[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->stat_cpu->events[idx], x);
}
-/* idx can be of type enum memcg_event_item or vm_event_item */
static inline void count_memcg_events(struct mem_cgroup *memcg,
int idx, unsigned long count)
{
- if (!mem_cgroup_disabled())
- this_cpu_add(memcg->stat->events[idx], count);
+ preempt_disable();
+ __count_memcg_events(memcg, idx, count);
+ preempt_enable();
}
/* idx can be of type enum memcg_event_item or vm_event_item */