Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 342
1 file changed, 194 insertions(+), 148 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7ddf91c4295f..ca194864d802 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -334,7 +334,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 	if (!old)
 		return 0;
 
-	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+	new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 	if (!new)
 		return -ENOMEM;
 
@@ -378,7 +378,7 @@ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_shrinker_map_mutex);
 	size = memcg_shrinker_map_size;
 	for_each_node(nid) {
-		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 		if (!map) {
 			memcg_free_shrinker_maps(memcg);
 			ret = -ENOMEM;
@@ -656,7 +656,7 @@ retry:
 	 */
 	__mem_cgroup_remove_exceeded(mz, mctz);
 	if (!soft_limit_excess(mz->memcg) ||
-	    !css_tryget_online(&mz->memcg->css))
+	    !css_tryget(&mz->memcg->css))
 		goto retry;
 done:
 	return mz;
@@ -759,13 +759,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 {
-	struct page *page = virt_to_head_page(p);
-	pg_data_t *pgdat = page_pgdat(page);
+	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
 	rcu_read_lock();
-	memcg = memcg_from_slab_page(page);
+	memcg = mem_cgroup_from_obj(p);
 
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg || memcg == root_mem_cgroup) {
@@ -973,7 +972,8 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 		return NULL;
 
 	rcu_read_lock();
-	if (!memcg || !css_tryget_online(&memcg->css))
+	/* Page should not get uncharged and freed memcg under us. */
+	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
 		memcg = root_mem_cgroup;
 	rcu_read_unlock();
 	return memcg;
@@ -986,10 +986,13 @@ EXPORT_SYMBOL(get_mem_cgroup_from_page);
 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 {
 	if (unlikely(current->active_memcg)) {
-		struct mem_cgroup *memcg = root_mem_cgroup;
+		struct mem_cgroup *memcg;
 
 		rcu_read_lock();
-		if (css_tryget_online(&current->active_memcg->css))
+		/* current->active_memcg must hold a ref. */
+		if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+			memcg = root_mem_cgroup;
+		else
 			memcg = current->active_memcg;
 		rcu_read_unlock();
 		return memcg;
@@ -1518,11 +1521,11 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 
 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
 		K((u64)page_counter_read(&memcg->memory)),
-		K((u64)memcg->memory.max), memcg->memory.failcnt);
+		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
 			K((u64)page_counter_read(&memcg->swap)),
-			K((u64)memcg->swap.max), memcg->swap.failcnt);
+			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
 	else {
 		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
 			K((u64)page_counter_read(&memcg->memsw)),
@@ -1549,13 +1552,13 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 {
 	unsigned long max;
 
-	max = memcg->memory.max;
+	max = READ_ONCE(memcg->memory.max);
 	if (mem_cgroup_swappiness(memcg)) {
 		unsigned long memsw_max;
 		unsigned long swap_max;
 
 		memsw_max = memcg->memsw.max;
-		swap_max = memcg->swap.max;
+		swap_max = READ_ONCE(memcg->swap.max);
 		swap_max = min(swap_max, (unsigned long)total_swap_pages);
 		max = min(max + swap_max, memsw_max);
 	}
@@ -1928,6 +1931,14 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
 		goto out;
 
 	/*
+	 * If the victim task has been asynchronously moved to a different
+	 * memory cgroup, we might end up killing tasks outside oom_domain.
+	 * In this case it's better to ignore memory.group.oom.
+	 */
+	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
+		goto out;
+
+	/*
 	 * Traverse the memory cgroup hierarchy from the victim task's
 	 * cgroup up to the OOMing cgroup (or root) to find the
 	 * highest-level memory cgroup with oom.group set.
@@ -2239,7 +2250,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
 			 gfp_t gfp_mask)
 {
 	do {
-		if (page_counter_read(&memcg->memory) <= memcg->high)
+		if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
 			continue;
 		memcg_memory_event(memcg, MEMCG_HIGH);
 		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2579,7 +2590,7 @@ done_restock:
 	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
-		if (page_counter_read(&memcg->memory) > memcg->high) {
+		if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
 			/* Don't bother a random interrupted task */
 			if (in_interrupt()) {
 				schedule_work(&memcg->high_work);
@@ -2882,18 +2893,16 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
 }
 
 /**
- * __memcg_kmem_charge_memcg: charge a kmem page
- * @page: page to charge
- * @gfp: reclaim mode
- * @order: allocation order
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
  * @memcg: memory cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
-int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
-			      struct mem_cgroup *memcg)
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			unsigned int nr_pages)
 {
-	unsigned int nr_pages = 1 << order;
 	struct page_counter *counter;
 	int ret;
 
@@ -2920,14 +2929,29 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 }
 
 /**
- * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_uncharge(&memcg->kmem, nr_pages);
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
  * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
-int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 {
 	struct mem_cgroup *memcg;
 	int ret = 0;
@@ -2937,7 +2961,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 
 	memcg = get_mem_cgroup_from_current();
 	if (!mem_cgroup_is_root(memcg)) {
-		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
 		if (!ret) {
 			page->mem_cgroup = memcg;
 			__SetPageKmemcg(page);
@@ -2948,26 +2972,11 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 }
 
 /**
- * __memcg_kmem_uncharge_memcg: uncharge a kmem page
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
-				 unsigned int nr_pages)
-{
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		page_counter_uncharge(&memcg->kmem, nr_pages);
-
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (do_memsw_account())
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-}
-/**
- * __memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
  * @page: page to uncharge
 * @order: allocation order
 */
-void __memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
 	struct mem_cgroup *memcg = page->mem_cgroup;
 	unsigned int nr_pages = 1 << order;
@@ -2976,7 +2985,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
 		return;
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-	__memcg_kmem_uncharge_memcg(memcg, nr_pages);
+	__memcg_kmem_uncharge(memcg, nr_pages);
 	page->mem_cgroup = NULL;
 
 	/* slab pages do not have PageKmemcg flag set */
@@ -3067,7 +3076,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		 * Make sure that the new limit (memsw or memory limit) doesn't
 		 * break our basic invariant rule memory.max <= memsw.max.
 		 */
-		limits_invariant = memsw ? max >= memcg->memory.max :
+		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
 		if (!limits_invariant) {
 			mutex_unlock(&memcg_max_mutex);
@@ -3814,8 +3823,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	/* Hierarchical information */
 	memory = memsw = PAGE_COUNTER_MAX;
 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
-		memory = min(memory, mi->memory.max);
-		memsw = min(memsw, mi->memsw.max);
+		memory = min(memory, READ_ONCE(mi->memory.max));
+		memsw = min(memsw, READ_ONCE(mi->memsw.max));
 	}
 	seq_printf(m, "hierarchical_memory_limit %llu\n",
 		   (u64)memory * PAGE_SIZE);
@@ -4324,7 +4333,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	*pheadroom = PAGE_COUNTER_MAX;
 
 	while ((parent = parent_mem_cgroup(memcg))) {
-		unsigned long ceiling = min(memcg->memory.max, memcg->high);
+		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
+					    READ_ONCE(memcg->high));
 		unsigned long used = page_counter_read(&memcg->memory);
 
 		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4792,7 +4802,8 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read_u64,
 	},
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#if defined(CONFIG_MEMCG_KMEM) && \
+	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
 	{
 		.name = "kmem.slabinfo",
 		.seq_start = memcg_slab_start,
@@ -4861,7 +4872,8 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
 	}
 }
 
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
+						  unsigned int n)
 {
 	refcount_add(n, &memcg->id.ref);
 }
@@ -5044,7 +5056,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!memcg)
 		return ERR_PTR(error);
 
-	memcg->high = PAGE_COUNTER_MAX;
+	WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
 	memcg->soft_limit = PAGE_COUNTER_MAX;
 	if (parent) {
 		memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -5197,7 +5209,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
-	memcg->high = PAGE_COUNTER_MAX;
+	WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
 	memcg->soft_limit = PAGE_COUNTER_MAX;
 	memcg_wb_domain_size_changed(memcg);
 }
@@ -6013,7 +6025,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
-	memcg->high = high;
+	WRITE_ONCE(memcg->high, high);
 
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6236,6 +6248,117 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.early_init = 0,
 };
 
+/*
+ * This function calculates an individual cgroup's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ *    the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ *    subsequent levels the effective protection is capped to the
+ *    parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ *    user is allowed to overcommit the declared protection at a given
+ *    level. If that is the case, the parent's effective protection is
+ *    distributed to the children in proportion to how much protection
+ *    they have declared and how much of it they are utilizing.
+ *
+ *    This makes distribution proportional, but also work-conserving:
+ *    if one cgroup claims much more protection than it uses memory,
+ *    the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ *    given level, the distribution of the larger parental protection
+ *    budget is NOT proportional. A cgroup's protection from a sibling
+ *    is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ *    without having to declare each individual cgroup's fixed share
+ *    of the ancestor's claim to protection, any unutilized -
+ *    "floating" - protection from up the tree is distributed in
+ *    proportion to each cgroup's *usage*. This makes the protection
+ *    neutral wrt sibling cgroups and lets them compete freely over
+ *    the shared parental protection budget, but it protects the
+ *    subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+					  unsigned long parent_usage,
+					  unsigned long setting,
+					  unsigned long parent_effective,
+					  unsigned long siblings_protected)
+{
+	unsigned long protected;
+	unsigned long ep;
+
+	protected = min(usage, setting);
+	/*
+	 * If all cgroups at this level combined claim and use more
+	 * protection then what the parent affords them, distribute
+	 * shares in proportion to utilization.
+	 *
+	 * We are using actual utilization rather than the statically
+	 * claimed protection in order to be work-conserving: claimed
+	 * but unused protection is available to siblings that would
+	 * otherwise get a smaller chunk than what they claimed.
+	 */
+	if (siblings_protected > parent_effective)
+		return protected * parent_effective / siblings_protected;
+
+	/*
+	 * Ok, utilized protection of all children is within what the
+	 * parent affords them, so we know whatever this child claims
+	 * and utilizes is effectively protected.
+	 *
+	 * If there is unprotected usage beyond this value, reclaim
+	 * will apply pressure in proportion to that amount.
+	 *
+	 * If there is unutilized protection, the cgroup will be fully
+	 * shielded from reclaim, but we do return a smaller value for
+	 * protection than what the group could enjoy in theory. This
+	 * is okay. With the overcommit distribution above, effective
+	 * protection is always dependent on how memory is actually
+	 * consumed among the siblings anyway.
+	 */
+	ep = protected;
+
+	/*
+	 * If the children aren't claiming (all of) the protection
+	 * afforded to them by the parent, distribute the remainder in
+	 * proportion to the (unprotected) memory of each cgroup. That
+	 * way, cgroups that aren't explicitly prioritized wrt each
+	 * other compete freely over the allowance, but they are
+	 * collectively protected from neighboring trees.
+	 *
+	 * We're using unprotected memory for the weight so that if
+	 * some cgroups DO claim explicit protection, we don't protect
+	 * the same bytes twice.
+	 */
+	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
+		return ep;
+
+	if (parent_effective > siblings_protected && usage > protected) {
+		unsigned long unclaimed;
+
+		unclaimed = parent_effective - siblings_protected;
+		unclaimed *= usage - protected;
+		unclaimed /= parent_usage - siblings_protected;
+
+		ep += unclaimed;
+	}
+
+	return ep;
+}
+
 /**
  * mem_cgroup_protected - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
@@ -6249,70 +6372,12 @@ struct cgroup_subsys memory_cgrp_subsys = {
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
 *     an unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
- *
- * @root is exclusive; it is never protected when looked at directly
- *
- * To provide a proper hierarchical behavior, effective memory.min/low values
- * are used. Below is the description of how effective memory.low is calculated.
- * Effective memory.min values is calculated in the same way.
- *
- * Effective memory.low is always equal or less than the original memory.low.
- * If there is no memory.low overcommittment (which is always true for
- * top-level memory cgroups), these two values are equal.
- * Otherwise, it's a part of parent's effective memory.low,
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
- * memory.low usages, where memory.low usage is the size of actually
- * protected memory.
- *
- *                                             low_usage
- * elow = min( memory.low, parent->elow * ------------------ ),
- *                                        siblings_low_usage
- *
- *             | memory.current, if memory.current < memory.low
- * low_usage = |
- *             | 0, otherwise.
- *
- *
- * Such definition of the effective memory.low provides the expected
- * hierarchical behavior: parent's memory.low value is limiting
- * children, unprotected memory is reclaimed first and cgroups,
- * which are not using their guarantee do not affect actual memory
- * distribution.
- *
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
- *
- *     A      A/memory.low = 2G, A/memory.current = 6G
- *    //\\
- *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
- *            C/memory.low = 1G  C/memory.current = 2G
- *            D/memory.low = 0   D/memory.current = 2G
- *            E/memory.low = 10G E/memory.current = 0
- *
- * and the memory pressure is applied, the following memory distribution
- * is expected (approximately):
- *
- *     A/memory.current = 2G
- *
- *     B/memory.current = 1.3G
- *     C/memory.current = 0.6G
- *     D/memory.current = 0
- *     E/memory.current = 0
- *
- * These calculations require constant tracking of the actual low usages
- * (see propagate_protected_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_protected()
- * path for each memory cgroup top-down from the reclaim,
- * it's possible to optimize this part, and save calculated elow
- * for next usage. This part is intentionally racy, but it's ok,
- * as memory.low is a best-effort mechanism.
 */
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
						 struct mem_cgroup *memcg)
 {
+	unsigned long usage, parent_usage;
 	struct mem_cgroup *parent;
-	unsigned long emin, parent_emin;
-	unsigned long elow, parent_elow;
-	unsigned long usage;
 
 	if (mem_cgroup_disabled())
 		return MEMCG_PROT_NONE;
@@ -6326,52 +6391,32 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 	if (!usage)
 		return MEMCG_PROT_NONE;
 
-	emin = memcg->memory.min;
-	elow = memcg->memory.low;
-
 	parent = parent_mem_cgroup(memcg);
 	/* No parent means a non-hierarchical mode on v1 memcg */
 	if (!parent)
 		return MEMCG_PROT_NONE;
 
-	if (parent == root)
-		goto exit;
-
-	parent_emin = READ_ONCE(parent->memory.emin);
-	emin = min(emin, parent_emin);
-	if (emin && parent_emin) {
-		unsigned long min_usage, siblings_min_usage;
-
-		min_usage = min(usage, memcg->memory.min);
-		siblings_min_usage = atomic_long_read(
-			&parent->memory.children_min_usage);
-
-		if (min_usage && siblings_min_usage)
-			emin = min(emin, parent_emin * min_usage /
-				   siblings_min_usage);
+	if (parent == root) {
+		memcg->memory.emin = READ_ONCE(memcg->memory.min);
+		memcg->memory.elow = memcg->memory.low;
+		goto out;
 	}
 
-	parent_elow = READ_ONCE(parent->memory.elow);
-	elow = min(elow, parent_elow);
-	if (elow && parent_elow) {
-		unsigned long low_usage, siblings_low_usage;
+	parent_usage = page_counter_read(&parent->memory);
 
-		low_usage = min(usage, memcg->memory.low);
-		siblings_low_usage = atomic_long_read(
-			&parent->memory.children_low_usage);
+	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
			READ_ONCE(memcg->memory.min),
			READ_ONCE(parent->memory.emin),
			atomic_long_read(&parent->memory.children_min_usage)));
 
-		if (low_usage && siblings_low_usage)
-			elow = min(elow, parent_elow * low_usage /
-				   siblings_low_usage);
-	}
+	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
			memcg->memory.low, READ_ONCE(parent->memory.elow),
			atomic_long_read(&parent->memory.children_low_usage)));
 
-exit:
-	memcg->memory.emin = emin;
-	memcg->memory.elow = elow;
-
-	if (usage <= emin)
+out:
+	if (usage <= memcg->memory.emin)
 		return MEMCG_PROT_MIN;
-	else if (usage <= elow)
+	else if (usage <= memcg->memory.elow)
 		return MEMCG_PROT_LOW;
 	else
 		return MEMCG_PROT_NONE;
@@ -6759,7 +6804,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
 		goto out;
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
 		goto out;
-	if (css_tryget_online(&memcg->css))
+	if (css_tryget(&memcg->css))
 		sk->sk_memcg = memcg;
 out:
 	rcu_read_unlock();
@@ -7080,7 +7125,8 @@ bool mem_cgroup_swap_full(struct page *page)
 		return false;
 
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+		if (page_counter_read(&memcg->swap) * 2 >=
+		    READ_ONCE(memcg->swap.max))
			return true;
 
 	return false;
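
Worked example: the proportional distribution in effective_protection()

The most substantive addition in this diff is effective_protection() and its
overcommit/undercommit rules. The following standalone userspace program is a
minimal sketch of the same integer arithmetic, not kernel code: min_ul() stands
in for the kernel's min() macro, the CGRP_ROOT_MEMORY_RECURSIVE_PROT branch is
assumed enabled, and the scenario numbers are the invented 2G/3G/1G values from
the example table that this series removes from the mem_cgroup_protected()
comment. It reproduces that table's expected B ~= 1.3G and C ~= 0.6G split.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Userspace mirror of the kernel's effective_protection() above.
 * Kernel-only details (the min() macro, the cgrp_dfl_root flag test)
 * are replaced or assumed enabled; purely illustrative.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected)
{
	unsigned long protected = min_ul(usage, setting);
	unsigned long ep = protected;

	/* Overcommit: scale shares in proportion to utilized protection. */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/* Undercommit: distribute floating protection by unprotected usage. */
	if (parent_effective > siblings_protected && usage > protected) {
		unsigned long unclaimed = parent_effective - siblings_protected;

		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;
		ep += unclaimed;
	}
	return ep;
}

int main(void)
{
	/*
	 * Invented scenario, taken from the documentation example the
	 * series deletes: parent elow = 2G, parent usage = 6G;
	 * B declares memory.low = 3G and uses 2G;
	 * C declares memory.low = 1G and uses 2G.
	 * children_low_usage = min(2G,3G) + min(2G,1G) = 3G, so the
	 * parent's 2G budget is overcommitted. Units below are MB.
	 */
	unsigned long G = 1024;

	printf("B: elow = %luM (expect ~1.3G)\n",
	       effective_protection(2 * G, 6 * G, 3 * G, 2 * G, 3 * G));
	printf("C: elow = %luM (expect ~0.6G)\n",
	       effective_protection(2 * G, 6 * G, 1 * G, 2 * G, 3 * G));
	return 0;
}

Because siblings_protected (3G of utilized memory.low) exceeds the parent's 2G
effective protection, both children are scaled down in proportion to what they
actually use (1365M and 682M here); had the children declared less than 2G in
total, the unclaimed remainder would instead float to them in proportion to
their unprotected usage.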