Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 387
1 file changed, 130 insertions(+), 257 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28928ce9b07f..d6ac0e33e150 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -318,9 +318,6 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
-	/* set when res.limit == memsw.limit */
-	bool		memsw_is_minimum;
-
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 
@@ -484,14 +481,6 @@ enum res_type {
 #define OOM_CONTROL		(0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
-#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -649,11 +638,13 @@ int memcg_limited_groups_array_size;
 struct static_key memcg_kmem_enabled_key;
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
+static void memcg_free_cache_id(int id);
+
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
 	if (memcg_kmem_is_active(memcg)) {
 		static_key_slow_dec(&memcg_kmem_enabled_key);
-		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+		memcg_free_cache_id(memcg->kmemcg_id);
 	}
 	/*
 	 * This check can't live in kmem destruction function,
@@ -1545,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
  * start move here.
  */
 
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
-	atomic_inc(&memcg_moving);
 	atomic_inc(&memcg->moving_account);
 	synchronize_rcu();
 }
@@ -1561,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
 	 * We check NULL in callee rather than caller.
 	 */
-	if (memcg) {
-		atomic_dec(&memcg_moving);
+	if (memcg)
 		atomic_dec(&memcg->moving_account);
-	}
 }
 
 /*
@@ -1806,42 +1791,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
-					gfp_t gfp_mask,
-					unsigned long flags)
-{
-	unsigned long total = 0;
-	bool noswap = false;
-	int loop;
-
-	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
-		noswap = true;
-	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
-		noswap = true;
-
-	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
-		if (loop)
-			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
-		/*
-		 * Allow limit shrinkers, which are triggered directly
-		 * by userspace, to catch signals and stop reclaim
-		 * after minimal progress, regardless of the margin.
-		 */
-		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
-			break;
-		if (mem_cgroup_margin(memcg))
-			break;
-		/*
-		 * If nothing was reclaimed after two attempts, there
-		 * may be no reclaimable pages in this hierarchy.
-		 */
-		if (loop && !total)
-			break;
-	}
-	return total;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -2249,41 +2198,52 @@ cleanup:
 	return true;
 }
 
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
  *
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
  *
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration. No race is possible.
+ *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ *   if (TestClearPageState(page))
+ *     mem_cgroup_update_page_stat(memcg, state, -1);
+ *   mem_cgroup_end_page_stat(memcg, locked, flags);
  *
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction. The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
  *
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged. E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
  */
-
-void __mem_cgroup_begin_update_page_stat(struct page *page,
-				bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+					      bool *locked,
+					      unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 
+	rcu_read_lock();
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
 	pc = lookup_page_cgroup(page);
 again:
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
-	/*
-	 * If this memory cgroup is not under account moving, we don't
-	 * need to take move_lock_mem_cgroup(). Because we already hold
-	 * rcu_read_lock(), any calls to move_account will be delayed until
-	 * rcu_read_unlock().
-	 */
-	VM_BUG_ON(!rcu_read_lock_held());
+		return NULL;
+
+	*locked = false;
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	move_lock_mem_cgroup(memcg, flags);
 	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2291,36 +2251,40 @@ again:
 		goto again;
 	}
 	*locked = true;
+
+	return memcg;
 }
 
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+			      unsigned long flags)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(page);
+	if (memcg && locked)
+		move_unlock_mem_cgroup(memcg, &flags);
 
-	/*
-	 * It's guaranteed that pc->mem_cgroup never changes while
-	 * lock is held because a routine modifies pc->mem_cgroup
-	 * should take move_lock_mem_cgroup().
-	 */
-	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+	rcu_read_unlock();
 }
 
-void mem_cgroup_update_page_stat(struct page *page,
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx, int val)
 {
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	unsigned long uninitialized_var(flags);
-
-	if (mem_cgroup_disabled())
-		return;
-	VM_BUG_ON(!rcu_read_lock_held());
-	memcg = pc->mem_cgroup;
-	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
-	this_cpu_add(memcg->stat->count[idx], val);
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
 }
 
 /*
@@ -2544,8 +2508,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct mem_cgroup *mem_over_limit;
 	struct res_counter *fail_res;
 	unsigned long nr_reclaimed;
-	unsigned long flags = 0;
 	unsigned long long size;
+	bool may_swap = true;
+	bool drained = false;
 	int ret = 0;
 
 	if (mem_cgroup_is_root(memcg))
@@ -2555,16 +2520,17 @@ retry:
 		goto done;
 
 	size = batch * PAGE_SIZE;
-	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
-		if (!do_swap_account)
-			goto done_restock;
-		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+	if (!do_swap_account ||
+	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
+		if (!res_counter_charge(&memcg->res, size, &fail_res))
 			goto done_restock;
-		res_counter_uncharge(&memcg->res, size);
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-	} else
+		if (do_swap_account)
+			res_counter_uncharge(&memcg->memsw, size);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+	} else {
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		may_swap = false;
+	}
 
 	if (batch > nr_pages) {
 		batch = nr_pages;
@@ -2588,11 +2554,18 @@ retry:
 	if (!(gfp_mask & __GFP_WAIT))
 		goto nomem;
 
-	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+						    gfp_mask, may_swap);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		goto retry;
 
+	if (!drained) {
+		drain_all_stock_async(mem_over_limit);
+		drained = true;
+		goto retry;
+	}
+
 	if (gfp_mask & __GFP_NORETRY)
 		goto nomem;
 	/*
@@ -2798,12 +2771,6 @@ static DEFINE_MUTEX(memcg_slab_mutex);
 
 static DEFINE_MUTEX(activate_kmem_mutex);
 
-static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
-{
-	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
-		memcg_kmem_is_active(memcg);
-}
-
 /*
  * This is a bit cumbersome, but it is rarely used and avoids a backpointer
  * in the memcg_cache_params struct.
@@ -2823,7 +2790,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 	struct memcg_cache_params *params;
 
-	if (!memcg_can_account_kmem(memcg))
+	if (!memcg_kmem_is_active(memcg))
 		return -EIO;
 
 	print_slabinfo_header(m);
@@ -2906,19 +2873,44 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
-static size_t memcg_caches_array_size(int num_groups)
+static int memcg_alloc_cache_id(void)
 {
-	ssize_t size;
-	if (num_groups <= 0)
-		return 0;
+	int id, size;
+	int err;
+
+	id = ida_simple_get(&kmem_limited_groups,
+			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	if (id < memcg_limited_groups_array_size)
+		return id;
 
-	size = 2 * num_groups;
+	/*
+	 * There's no space for the new id in memcg_caches arrays,
+	 * so we have to grow them.
+	 */
+
+	size = 2 * (id + 1);
 	if (size < MEMCG_CACHES_MIN_SIZE)
 		size = MEMCG_CACHES_MIN_SIZE;
 	else if (size > MEMCG_CACHES_MAX_SIZE)
 		size = MEMCG_CACHES_MAX_SIZE;
 
-	return size;
+	mutex_lock(&memcg_slab_mutex);
+	err = memcg_update_all_caches(size);
+	mutex_unlock(&memcg_slab_mutex);
+
+	if (err) {
+		ida_simple_remove(&kmem_limited_groups, id);
+		return err;
+	}
+	return id;
+}
+
+static void memcg_free_cache_id(int id)
+{
+	ida_simple_remove(&kmem_limited_groups, id);
 }
 
 /*
@@ -2928,97 +2920,7 @@ static size_t memcg_caches_array_size(int num_groups)
  */
 void memcg_update_array_size(int num)
 {
-	if (num > memcg_limited_groups_array_size)
-		memcg_limited_groups_array_size = memcg_caches_array_size(num);
-}
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
-{
-	struct memcg_cache_params *cur_params = s->memcg_params;
-
-	VM_BUG_ON(!is_root_cache(s));
-
-	if (num_groups > memcg_limited_groups_array_size) {
-		int i;
-		struct memcg_cache_params *new_params;
-		ssize_t size = memcg_caches_array_size(num_groups);
-
-		size *= sizeof(void *);
-		size += offsetof(struct memcg_cache_params, memcg_caches);
-
-		new_params = kzalloc(size, GFP_KERNEL);
-		if (!new_params)
-			return -ENOMEM;
-
-		new_params->is_root_cache = true;
-
-		/*
-		 * There is the chance it will be bigger than
-		 * memcg_limited_groups_array_size, if we failed an allocation
-		 * in a cache, in which case all caches updated before it, will
-		 * have a bigger array.
-		 *
-		 * But if that is the case, the data after
-		 * memcg_limited_groups_array_size is certainly unused
-		 */
-		for (i = 0; i < memcg_limited_groups_array_size; i++) {
-			if (!cur_params->memcg_caches[i])
-				continue;
-			new_params->memcg_caches[i] =
-				cur_params->memcg_caches[i];
-		}
-
-		/*
-		 * Ideally, we would wait until all caches succeed, and only
-		 * then free the old one. But this is not worth the extra
-		 * pointer per-cache we'd have to have for this.
-		 *
-		 * It is not a big deal if some caches are left with a size
-		 * bigger than the others. And all updates will reset this
-		 * anyway.
-		 */
-		rcu_assign_pointer(s->memcg_params, new_params);
-		if (cur_params)
-			kfree_rcu(cur_params, rcu_head);
-	}
-	return 0;
-}
-
-int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
-			     struct kmem_cache *root_cache)
-{
-	size_t size;
-
-	if (!memcg_kmem_enabled())
-		return 0;
-
-	if (!memcg) {
-		size = offsetof(struct memcg_cache_params, memcg_caches);
-		size += memcg_limited_groups_array_size * sizeof(void *);
-	} else
-		size = sizeof(struct memcg_cache_params);
-
-	s->memcg_params = kzalloc(size, GFP_KERNEL);
-	if (!s->memcg_params)
-		return -ENOMEM;
-
-	if (memcg) {
-		s->memcg_params->memcg = memcg;
-		s->memcg_params->root_cache = root_cache;
-		css_get(&memcg->css);
-	} else
-		s->memcg_params->is_root_cache = true;
-
-	return 0;
-}
-
-void memcg_free_cache_params(struct kmem_cache *s)
-{
-	if (!s->memcg_params)
-		return;
-	if (!s->memcg_params->is_root_cache)
-		css_put(&s->memcg_params->memcg->css);
-	kfree(s->memcg_params);
+	memcg_limited_groups_array_size = num;
 }
 
 static void memcg_register_cache(struct mem_cgroup *memcg,
@@ -3051,6 +2953,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
+	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -3084,6 +2987,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 
 	list_del(&cachep->memcg_params->list);
 	kmem_cache_destroy(cachep);
+
+	/* drop the reference taken in memcg_register_cache */
+	css_put(&memcg->css);
 }
 
 /*
@@ -3261,7 +3167,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
 
-	if (!memcg_can_account_kmem(memcg))
+	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
@@ -3346,7 +3252,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 
 	memcg = get_mem_cgroup_from_mm(current->mm);
 
-	if (!memcg_can_account_kmem(memcg)) {
+	if (!memcg_kmem_is_active(memcg)) {
 		css_put(&memcg->css);
 		return true;
 	}
@@ -3688,7 +3594,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				   unsigned long long val)
 {
 	int retry_count;
-	u64 memswlimit, memlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
 	u64 curusage, oldusage;
@@ -3715,31 +3620,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
 		 */
 		mutex_lock(&set_limit_mutex);
-		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-		if (memswlimit < val) {
+		if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
 			ret = -EINVAL;
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
 
-		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
-		if (memlimit < val)
+		if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
 			enlarge = 1;
 
 		ret = res_counter_set_limit(&memcg->res, val);
-		if (!ret) {
-			if (memswlimit == val)
-				memcg->memsw_is_minimum = true;
-			else
-				memcg->memsw_is_minimum = false;
-		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -3757,7 +3654,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 					 unsigned long long val)
 {
 	int retry_count;
-	u64 memlimit, memswlimit, oldusage, curusage;
+	u64 oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
 	int enlarge = 0;
@@ -3776,30 +3673,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
 		 */
 		mutex_lock(&set_limit_mutex);
-		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
-		if (memlimit > val) {
+		if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
 			ret = -EINVAL;
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
-		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-		if (memswlimit < val)
+		if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
 			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
-		if (!ret) {
-			if (memlimit == val)
-				memcg->memsw_is_minimum = true;
-			else
-				memcg->memsw_is_minimum = false;
-		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_NOSWAP |
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -4048,8 +3936,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, 1,
+							GFP_KERNEL, true);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4214,23 +4102,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
 	if (err)
 		goto out;
 
-	memcg_id = ida_simple_get(&kmem_limited_groups,
-				  0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	memcg_id = memcg_alloc_cache_id();
 	if (memcg_id < 0) {
 		err = memcg_id;
 		goto out;
 	}
 
-	/*
-	 * Make sure we have enough space for this cgroup in each root cache's
-	 * memcg_params.
-	 */
-	mutex_lock(&memcg_slab_mutex);
-	err = memcg_update_all_caches(memcg_id + 1);
-	mutex_unlock(&memcg_slab_mutex);
-	if (err)
-		goto out_rmid;
-
 	memcg->kmemcg_id = memcg_id;
 	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
 
@@ -4251,10 +4128,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
 out:
 	memcg_resume_kmem_account();
 	return err;
-
-out_rmid:
-	ida_simple_remove(&kmem_limited_groups, memcg_id);
-	goto out;
 }
 
 static int memcg_activate_kmem(struct mem_cgroup *memcg,
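Note on the new page-state accounting API above: the kernel-doc for mem_cgroup_begin_page_stat() describes a begin/update/end transaction around a page state change. The sketch below is a minimal, hypothetical userspace model of that pattern, not kernel code: a plain counter stands in for memcg->moving_account, a pthread mutex for move_lock, and the RCU and IRQ-flag handling are deliberately omitted. It only illustrates the lockless fast path versus the locked slow path and the NULL-memcg convention.

/* Hypothetical userspace model of the begin/update/end page-stat
 * transaction added above. All types and helpers here are stand-ins,
 * not the kernel's: a pthread mutex models memcg->move_lock and a
 * plain counter models memcg->moving_account; RCU is omitted. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct memcg {
	int moving_account;          /* >0 while charges are being moved */
	pthread_mutex_t move_lock;   /* models move_lock_mem_cgroup() */
	long stat[1];                /* one page-state counter is enough here */
};

struct page {
	struct memcg *memcg;         /* models pc->mem_cgroup */
	bool writeback;              /* models PageWriteback() */
};

/* Fast path: no lock when no move is in flight. Slow path: take move_lock. */
static struct memcg *begin_page_stat(struct page *page, bool *locked)
{
	struct memcg *memcg = page->memcg;

	*locked = false;
	if (!memcg)
		return NULL;
	if (memcg->moving_account <= 0)
		return memcg;
	pthread_mutex_lock(&memcg->move_lock);
	*locked = true;
	return memcg;
}

static void update_page_stat(struct memcg *memcg, int idx, int val)
{
	if (memcg)                   /* NULL means "not accounted", as in the patch */
		memcg->stat[idx] += val;
}

static void end_page_stat(struct memcg *memcg, bool locked)
{
	if (memcg && locked)
		pthread_mutex_unlock(&memcg->move_lock);
}

int main(void)
{
	struct memcg m = { .move_lock = PTHREAD_MUTEX_INITIALIZER };
	struct page p = { .memcg = &m, .writeback = true };
	bool locked;

	/* end-writeback style caller: clear the flag, account -1 page */
	struct memcg *memcg = begin_page_stat(&p, &locked);
	if (p.writeback) {
		p.writeback = false;
		update_page_stat(memcg, 0, -1);
	}
	end_page_stat(memcg, locked);

	printf("writeback counter: %ld\n", m.stat[0]);
	return 0;
}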
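The reworked try_charge() hunk charges the memory+swap counter first, lets the counter that failed decide whether reclaim may swap, and drains the per-cpu charge stock only once per attempt instead of inside a fixed reclaim loop. Below is a rough, self-contained model of that control flow; counter_charge(), reclaim_pages() and drain_stock() are invented stubs, not kernel functions, and the retry bookkeeping is simplified, so treat it as a sketch of the ordering rather than the actual implementation.

/* Rough standalone sketch of the reworked charge retry flow above.
 * Counters, reclaim and stock draining are invented stubs so the
 * control flow can be read (and compiled) in isolation. */
#include <stdbool.h>

struct counter { long usage, limit; };

static bool counter_charge(struct counter *c, long n)
{
	if (c->usage + n > c->limit)
		return false;              /* over limit: charge fails */
	c->usage += n;
	return true;
}

static void counter_uncharge(struct counter *c, long n) { c->usage -= n; }

/* stand-ins for try_to_free_mem_cgroup_pages() and drain_all_stock_async() */
static void reclaim_pages(struct counter *c, long want, bool may_swap)
{
	long freed = may_swap ? want : want / 2;   /* arbitrary model */
	c->usage -= freed < c->usage ? freed : c->usage;
}
static void drain_stock(struct counter *c) { (void)c; }

/* Charge @nr pages against memory and memory+swap, mirroring the order
 * used by the patch: memsw first, then res. */
static int try_charge_model(struct counter *res, struct counter *memsw,
			    long nr, int retries)
{
	bool drained = false;

	while (retries-- > 0) {
		bool may_swap = true;
		struct counter *over;

		if (counter_charge(memsw, nr)) {
			if (counter_charge(res, nr))
				return 0;          /* success */
			counter_uncharge(memsw, nr);
			over = res;                /* memory limit hit: swap allowed */
		} else {
			over = memsw;              /* memsw limit hit: swap won't help */
			may_swap = false;
		}

		reclaim_pages(over, nr, may_swap);
		if (over->limit - over->usage >= nr)
			continue;                  /* margin regained, retry the charge */

		if (!drained) {                    /* drain cached stock once, then retry */
			drain_stock(over);
			drained = true;
		}
	}
	return -1;                                 /* model of giving up (OOM path) */
}

int main(void)
{
	struct counter res = { .limit = 100 }, memsw = { .limit = 120 };
	return try_charge_model(&res, &memsw, 32, 5) == 0 ? 0 : 1;
}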
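memcg_alloc_cache_id() above couples ID allocation with on-demand growth of the per-root-cache memcg_caches arrays, sized as 2 * (id + 1) and clamped between the MIN/MAX constants, and rolls the ID back if growing fails. The following standalone model reproduces only that sizing and rollback policy; the bitmap allocator, the array and the constants are made-up stand-ins, not the kernel's IDA or memcg_update_all_caches().

/* Self-contained model of the id-allocation + array-growth policy in
 * memcg_alloc_cache_id() above. The bitmap allocator, the caches array
 * and the MIN/MAX constants are stand-ins, not kernel interfaces. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CACHES_MIN_SIZE 4
#define CACHES_MAX_SIZE 64

static unsigned char ids_in_use[CACHES_MAX_SIZE]; /* models the IDA */
static void **caches;                             /* models memcg_caches[] */
static int caches_array_size;                     /* models memcg_limited_groups_array_size */

/* models memcg_update_all_caches(): (re)allocate the array at @size */
static int grow_caches_array(int size)
{
	void **bigger = calloc(size, sizeof(*bigger));

	if (!bigger)
		return -1;
	if (caches) {
		memcpy(bigger, caches, caches_array_size * sizeof(*caches));
		free(caches);
	}
	caches = bigger;
	caches_array_size = size;
	return 0;
}

static int alloc_cache_id(void)
{
	int id, size;

	for (id = 0; id < CACHES_MAX_SIZE; id++)
		if (!ids_in_use[id])
			break;
	if (id == CACHES_MAX_SIZE)
		return -1;
	ids_in_use[id] = 1;

	if (id < caches_array_size)
		return id;              /* existing arrays already have a slot */

	/* no slot for the new id: grow to 2 * (id + 1), clamped */
	size = 2 * (id + 1);
	if (size < CACHES_MIN_SIZE)
		size = CACHES_MIN_SIZE;
	else if (size > CACHES_MAX_SIZE)
		size = CACHES_MAX_SIZE;

	if (grow_caches_array(size)) {
		ids_in_use[id] = 0;     /* roll the id back on failure */
		return -1;
	}
	return id;
}

static void free_cache_id(int id)
{
	ids_in_use[id] = 0;
}

int main(void)
{
	for (int i = 0; i < 6; i++) {
		int id = alloc_cache_id();
		printf("id=%d array_size=%d\n", id, caches_array_size);
	}
	free_cache_id(0);
	return 0;
}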