diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 267 |
1 files changed, 157 insertions, 110 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66beca1ad92f..ae052b5e3315 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,6 +921,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** + * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each task + * @arg: argument passed to @fn + * + * This function iterates over tasks attached to @memcg or to any of its + * descendants and calls @fn for each task. If @fn returns a non-zero + * value, the function breaks the iteration loop and returns the value. + * Otherwise, it will iterate over all tasks and return 0. + * + * This function must not be called for the root memory cgroup. + */ +int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + struct mem_cgroup *iter; + int ret = 0; + + BUG_ON(memcg == root_mem_cgroup); + + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + if (ret) { + mem_cgroup_iter_break(memcg, iter); + break; + } + } + return ret; +} + +/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { unsigned long limit; @@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, .gfp_mask = gfp_mask, .order = order, }; - struct mem_cgroup *iter; - unsigned long chosen_points = 0; - unsigned long totalpages; - unsigned int points = 0; - struct task_struct *chosen = NULL; + bool ret; mutex_lock(&oom_lock); - - /* - * If current has a pending SIGKILL or is exiting, then automatically - * select it. The goal is to allow it to allocate so that it may - * quickly exit and free its memory. - */ - if (task_will_free_mem(current)) { - mark_oom_victim(current); - wake_oom_reaper(current); - goto unlock; - } - - check_panic_on_oom(&oc, CONSTRAINT_MEMCG); - totalpages = mem_cgroup_get_limit(memcg) ? : 1; - for_each_mem_cgroup_tree(iter, memcg) { - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&iter->css, &it); - while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task)) { - case OOM_SCAN_SELECT: - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = ULONG_MAX; - get_task_struct(chosen); - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - css_task_iter_end(&it); - mem_cgroup_iter_break(memcg, iter); - if (chosen) - put_task_struct(chosen); - /* Set a dummy value to return "true". */ - chosen = (void *) 1; - goto unlock; - case OOM_SCAN_OK: - break; - }; - points = oom_badness(task, memcg, NULL, totalpages); - if (!points || points < chosen_points) - continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && - thread_group_leader(chosen)) - continue; - - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = points; - get_task_struct(chosen); - } - css_task_iter_end(&it); - } - - if (chosen) { - points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, - "Memory cgroup out of memory"); - } -unlock: + ret = out_of_memory(&oc); mutex_unlock(&oom_lock); - return chosen; + return ret; } #if MAX_NUMNODES > 1 @@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle || oom_killer_disabled) + if (!handle) goto cleanup; owait.memcg = memcg; @@ -1740,17 +1710,22 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned long flags; bool ret = false; if (nr_pages > CHARGE_BATCH) return ret; - stock = &get_cpu_var(memcg_stock); + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } - put_cpu_var(memcg_stock); + + local_irq_restore(flags); + return ret; } @@ -1771,15 +1746,18 @@ static void drain_stock(struct memcg_stock_pcp *stock) stock->cached = NULL; } -/* - * This must be called under preempt disabled or must be called by - * a thread which is pinned to local cpu. - */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_irq_restore(flags); } /* @@ -1788,14 +1766,19 @@ static void drain_local_stock(struct work_struct *dummy) */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); stock->cached = memcg; } stock->nr_pages += nr_pages; - put_cpu_var(memcg_stock); + + local_irq_restore(flags); } /* @@ -2337,8 +2320,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!ret) + __SetPageKmemcg(page); + } css_put(&memcg->css); return ret; } @@ -2365,6 +2351,11 @@ void memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; + + /* slab pages do not have PageKmemcg flag set */ + if (PageKmemcg(page)) + __ClearPageKmemcg(page); + css_put_many(&memcg->css, nr_pages); } #endif /* !CONFIG_SLOB */ @@ -2948,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. + * function is the last one to run. See mem_cgroup_sk_alloc() + * for details, and note that we don't mark any socket as + * belonging to this memcg until that flag is up. * * We need to do this, because static_keys will span multiple * sites, but we can't control their order. If we mark a socket * as accounted, but the accounting functions are not patched in * yet, we'll lose accounting. * - * We never race with the readers in sock_update_memcg(), + * We never race with the readers in mem_cgroup_sk_alloc(), * because when this value change, the code to process it is not * patched in yet. */ @@ -4069,14 +4060,16 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); + atomic_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + VM_BUG_ON(atomic_read(&memcg->id.ref) < n); + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4085,6 +4078,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4254,8 +4257,10 @@ fail: static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* Online state pins memcg ID, memcg ID pins CSS */ - mem_cgroup_id_get(mem_cgroup_from_css(css)); + atomic_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -4403,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(swap_address_space(ent), ent.val); + page = find_get_page(swap_address_space(ent), swp_offset(ent)); if (do_memsw_account()) entry->val = ent.val; @@ -4441,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); @@ -4676,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) .mm = mm, }; down_read(&mm->mmap_sem); - walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); + walk_page_range(0, mm->highest_vm_end, + &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -4719,6 +4726,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4726,9 +4735,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -4962,7 +4971,8 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); + walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } @@ -5537,8 +5547,10 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; pgpgout++; - } else + } else { nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } page->mem_cgroup = NULL; } while (next != page_list); @@ -5639,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -void sock_update_memcg(struct sock *sk) +void mem_cgroup_sk_alloc(struct sock *sk) { struct mem_cgroup *memcg; - /* Socket cloning can throw us here with sk_cgrp already + if (!mem_cgroup_sockets_enabled) + return; + + /* + * Socket cloning can throw us here with sk_memcg already * filled. It won't however, necessarily happen from * process context. So the test for root memcg given * the current task's memcg won't help us in this case. @@ -5668,12 +5684,11 @@ void sock_update_memcg(struct sock *sk) out: rcu_read_unlock(); } -EXPORT_SYMBOL(sock_update_memcg); -void sock_release_memcg(struct sock *sk) +void mem_cgroup_sk_free(struct sock *sk) { - WARN_ON(!sk->sk_memcg); - css_put(&sk->sk_memcg->css); + if (sk->sk_memcg) + css_put(&sk->sk_memcg->css); } /** @@ -5781,6 +5796,24 @@ static int __init mem_cgroup_init(void) subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_swapout - transfer a memsw charge to swap * @page: page whose memsw charge to transfer @@ -5790,7 +5823,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5805,16 +5838,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is @@ -5853,11 +5897,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + memcg = mem_cgroup_id_get_online(memcg); + if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) + !page_counter_try_charge(&memcg->swap, 1, &counter)) { + mem_cgroup_id_put(memcg); return -ENOMEM; + } - mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); |