diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 262 |
1 files changed, 119 insertions, 143 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1c52ddbc839b..796820925de0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,6 +54,7 @@ #include <linux/page_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/lockdep.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -498,6 +499,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + /* + * The ID of the root cgroup is 0, but memcg treat 0 as an + * invalid ID, so we return (cgroup_id + 1). + */ + return memcg->css.cgroup->id + 1; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id - 1, &mem_cgroup_subsys); + return mem_cgroup_from_css(css); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) @@ -569,16 +593,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * There are two main reasons for not using the css_id for this: - * 1) this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * 2) In order not to violate the cgroup API, we would like to do all memory - * allocation in ->create(). At that point, we haven't yet allocated the - * css_id. Having a separate index prevents us from messing with the cgroup - * core for this + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. * * The current size of the caches array is stored in * memcg_limited_groups_array_size. It will double each time we have to @@ -593,14 +612,14 @@ int memcg_limited_groups_array_size; * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the - * css_id space is not getting any smaller, and we don't have to necessarily + * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. */ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be @@ -866,6 +885,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; + get_online_cpus(); for_each_online_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -873,6 +893,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, val += memcg->nocpu_base.events[idx]; spin_unlock(&memcg->pcp_counter_lock); #endif + put_online_cpus(); return val; } @@ -1405,7 +1426,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return true; if (!root_memcg->use_hierarchy || !memcg) return false; - return css_is_ancestor(&memcg->css, &root_memcg->css); + return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); } static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, @@ -2044,6 +2065,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, return total; } +#ifdef CONFIG_LOCKDEP +static struct lockdep_map memcg_oom_lock_dep_map = { + .name = "memcg_oom_lock", +}; +#endif + static DEFINE_SPINLOCK(memcg_oom_lock); /* @@ -2081,7 +2108,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) } iter->oom_lock = false; } - } + } else + mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); spin_unlock(&memcg_oom_lock); @@ -2093,6 +2121,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) struct mem_cgroup *iter; spin_lock(&memcg_oom_lock); + mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; spin_unlock(&memcg_oom_lock); @@ -2159,110 +2188,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) memcg_wakeup_oom(memcg); } -/* - * try to call OOM killer - */ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - bool locked; - int wakeups; - if (!current->memcg_oom.may_oom) return; - - current->memcg_oom.in_memcg_oom = 1; - /* - * As with any blocking lock, a contender needs to start - * listening for wakeups before attempting the trylock, - * otherwise it can miss the wakeup from the unlock and sleep - * indefinitely. This is just open-coded because our locking - * is so particular to memcg hierarchies. + * We are in the middle of the charge context here, so we + * don't want to block when potentially sitting on a callstack + * that holds all kinds of filesystem and mm locks. + * + * Also, the caller may handle a failed allocation gracefully + * (like optional page cache readahead) and so an OOM killer + * invocation might not even be necessary. + * + * That's why we don't do anything here except remember the + * OOM context and then deal with it at the end of the page + * fault when the stack is unwound, the locks are released, + * and when we know whether the fault was overall successful. */ - wakeups = atomic_read(&memcg->oom_wakeups); - mem_cgroup_mark_under_oom(memcg); - - locked = mem_cgroup_oom_trylock(memcg); - - if (locked) - mem_cgroup_oom_notify(memcg); - - if (locked && !memcg->oom_kill_disable) { - mem_cgroup_unmark_under_oom(memcg); - mem_cgroup_out_of_memory(memcg, mask, order); - mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. - */ - memcg_oom_recover(memcg); - } else { - /* - * A system call can just return -ENOMEM, but if this - * is a page fault and somebody else is handling the - * OOM already, we need to sleep on the OOM waitqueue - * for this memcg until the situation is resolved. - * Which can take some time because it might be - * handled by a userspace task. - * - * However, this is the charge context, which means - * that we may sit on a large call stack and hold - * various filesystem locks, the mmap_sem etc. and we - * don't want the OOM handler to deadlock on them - * while we sit here and wait. Store the current OOM - * context in the task_struct, then return -ENOMEM. - * At the end of the page fault handler, with the - * stack unwound, pagefault_out_of_memory() will check - * back with us by calling - * mem_cgroup_oom_synchronize(), possibly putting the - * task to sleep. - */ - current->memcg_oom.oom_locked = locked; - current->memcg_oom.wakeups = wakeups; - css_get(&memcg->css); - current->memcg_oom.wait_on_memcg = memcg; - } + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; } /** * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise. */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - struct mem_cgroup *memcg; + bool locked; /* OOM is global, do not handle */ - if (!current->memcg_oom.in_memcg_oom) - return false; - - /* - * We invoked the OOM killer but there is a chance that a kill - * did not free up any charges. Everybody else might already - * be sleeping, so restart the fault and keep the rampage - * going until some charges are released. - */ - memcg = current->memcg_oom.wait_on_memcg; if (!memcg) - goto out; + return false; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - goto out_memcg; + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; @@ -2271,13 +2249,25 @@ bool mem_cgroup_oom_synchronize(void) INIT_LIST_HEAD(&owait.wait.task_list); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - /* Only sleep if we didn't miss any wakeups since OOM */ - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); + } else { schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: - mem_cgroup_unmark_under_oom(memcg); - if (current->memcg_oom.oom_locked) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + + if (locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2286,10 +2276,9 @@ out_memcg: */ memcg_oom_recover(memcg); } +cleanup: + current->memcg_oom.memcg = NULL; css_put(&memcg->css); - current->memcg_oom.wait_on_memcg = NULL; -out: - current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2703,6 +2692,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, || fatal_signal_pending(current))) goto bypass; + if (unlikely(task_in_memcg_oom(current))) + goto bypass; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -2800,8 +2792,10 @@ done: *ptr = memcg; return 0; nomem: - *ptr = NULL; - return -ENOMEM; + if (!(gfp_mask & __GFP_NOFAIL)) { + *ptr = NULL; + return -ENOMEM; + } bypass: *ptr = root_mem_cgroup; return -EINTR; @@ -2850,15 +2844,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return mem_cgroup_from_css(css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -3806,8 +3795,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, { /* Update stat data for mem_cgroup */ preempt_disable(); - WARN_ON_ONCE(from->stat->count[idx] < nr_pages); - __this_cpu_add(from->stat->count[idx], -nr_pages); + __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); preempt_enable(); } @@ -4375,7 +4363,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, css_id(&memcg->css)); + swap_cgroup_record(ent, mem_cgroup_id(memcg)); } #endif @@ -4427,8 +4415,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, { unsigned short old_id, new_id; - old_id = css_id(&from->css); - new_id = css_id(&to->css); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -4983,31 +4971,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } while (usage > 0); } -/* - * This mainly exists for tests during the setting of set of use_hierarchy. - * Since this is the very setting we are changing, the current hierarchy value - * is meaningless - */ -static inline bool __memcg_has_children(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *pos; - - /* bounce at first found */ - css_for_each_child(pos, &memcg->css) - return true; - return false; -} - -/* - * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed - * to be already dead (as in mem_cgroup_force_empty, for instance). This is - * from mem_cgroup_count_children(), in the sense that we don't really care how - * many children we have; we only need to know if we have any. It also counts - * any memcg without hierarchy as infertile. - */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - return memcg->use_hierarchy && __memcg_has_children(memcg); + lockdep_assert_held(&memcg_create_mutex); + /* + * The lock does not prevent addition or deletion to the list + * of children, but it prevents a new child from being + * initialized based on this parent in css_online(), so it's + * enough to decide whether hierarchically inherited + * attributes can still be changed or not. + */ + return memcg->use_hierarchy && + !list_empty(&memcg->css.cgroup->children); } /* @@ -5087,7 +5062,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (!__memcg_has_children(memcg)) + if (list_empty(&memcg->css.cgroup->children)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -6204,7 +6179,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@ -6307,6 +6281,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + if (!parent) return 0; @@ -6578,7 +6555,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@ -6998,7 +6975,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP |