diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 285 |
1 files changed, 270 insertions, 15 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cdbb7a84cb6e..597d58101872 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,6 +87,10 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +#ifdef CONFIG_CGROUP_WRITEBACK +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); +#endif + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -752,15 +756,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg */ __mod_memcg_state(memcg, idx, val); + /* Update lruvec */ + __this_cpu_add(pn->lruvec_stat_local->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup_per_node *pi; - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], x); for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; @@ -768,6 +770,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +{ + struct page *page = virt_to_head_page(p); + pg_data_t *pgdat = page_pgdat(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = memcg_from_slab_page(page); + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg || memcg == root_mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(pgdat, memcg); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -1130,26 +1152,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +static void __invalidate_reclaim_iterators(struct mem_cgroup *from, + struct mem_cgroup *dead_memcg) { - struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; int i; - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + for_each_node(nid) { + mz = mem_cgroup_nodeinfo(from, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup *last; + + do { + __invalidate_reclaim_iterators(memcg, dead_memcg); + last = memcg; + } while ((memcg = parent_mem_cgroup(memcg))); + + /* + * When cgruop1 non-hierarchy mode is used, + * parent_mem_cgroup() does not walk all the way up to the + * cgroup root (root_mem_cgroup). So we have to handle + * dead_memcg from cgroup root separately. + */ + if (last != root_mem_cgroup) + __invalidate_reclaim_iterators(root_mem_cgroup, + dead_memcg); +} + /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root @@ -3221,6 +3262,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) +{ + unsigned long stat[MEMCG_NR_STAT]; + struct mem_cgroup *mi; + int node, cpu, i; + int min_idx, max_idx; + + if (slab_only) { + min_idx = NR_SLAB_RECLAIMABLE; + max_idx = NR_SLAB_UNRECLAIMABLE; + } else { + min_idx = 0; + max_idx = MEMCG_NR_STAT; + } + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &mi->vmstats[i]); + + if (!slab_only) + max_idx = NR_VM_NODE_STAT_ITEMS; + + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + struct mem_cgroup_per_node *pi; + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu( + pn->lruvec_stat_cpu->count[i], cpu); + + for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &pi->lruvec_stat[i]); + } +} + +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct mem_cgroup *mi; + int cpu, i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] += per_cpu(memcg->vmstats_percpu->events[i], + cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + atomic_long_add(events[i], &mi->vmevents[i]); +} + #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3270,7 +3377,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; + /* + * Deactivate and reparent kmem_caches. Then flush percpu + * slab statistics to have precise values at the parent and + * all ancestor levels. It's required to keep slab stats + * accurate after the reparenting of kmem_caches. + */ memcg_deactivate_kmem_caches(memcg, parent); + memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4062,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK +#include <trace/events/writeback.h> + static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -4145,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, } } +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * trackes ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. + * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underyling IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. + * + * The following two functions implement such mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning + * bdi_writeback on the page owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. + */ +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + trace_track_foreign_dirty(page, wb); + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. + */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) @@ -4643,6 +4883,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + /* + * Flush percpu vmstats and vmevents to guarantee the value correctness + * on parent's and all ancestor levels. + */ + memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); @@ -4661,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) struct mem_cgroup *memcg; unsigned int size; int node; + int __maybe_unused i; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); @@ -4704,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + memcg->cgwb_frn[i].done = + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -4833,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int __maybe_unused i; +#ifdef CONFIG_CGROUP_WRITEBACK + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + wb_wait_for_completion(&memcg->cgwb_frn[i].done); +#endif if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); |