Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 752
1 file changed, 473 insertions(+), 279 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8cc617ede7e2..8d9ceea7fe4d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; -#define MEM_CGROUP_RECLAIM_RETRIES 5 - /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; @@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM +extern spinlock_t css_set_lock; + +static void obj_cgroup_release(struct percpu_ref *ref) +{ + struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); + struct mem_cgroup *memcg; + unsigned int nr_bytes; + unsigned int nr_pages; + unsigned long flags; + + /* + * At this point all allocated objects are freed, and + * objcg->nr_charged_bytes can't have an arbitrary byte value. + * However, it can be PAGE_SIZE or (x * PAGE_SIZE). + * + * The following sequence can lead to it: + * 1) CPU0: objcg == stock->cached_objcg + * 2) CPU1: we do a small allocation (e.g. 92 bytes), + * PAGE_SIZE bytes are charged + * 3) CPU1: a process from another memcg is allocating something, + * the stock if flushed, + * objcg->nr_charged_bytes = PAGE_SIZE - 92 + * 5) CPU0: we do release this object, + * 92 bytes are added to stock->nr_bytes + * 6) CPU0: stock is flushed, + * 92 bytes are added to objcg->nr_charged_bytes + * + * In the result, nr_charged_bytes == PAGE_SIZE. + * This page will be uncharged in obj_cgroup_release(). + */ + nr_bytes = atomic_read(&objcg->nr_charged_bytes); + WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); + nr_pages = nr_bytes >> PAGE_SHIFT; + + spin_lock_irqsave(&css_set_lock, flags); + memcg = obj_cgroup_memcg(objcg); + if (nr_pages) + __memcg_kmem_uncharge(memcg, nr_pages); + list_del(&objcg->list); + mem_cgroup_put(memcg); + spin_unlock_irqrestore(&css_set_lock, flags); + + percpu_ref_exit(ref); + kfree_rcu(objcg, rcu); +} + +static struct obj_cgroup *obj_cgroup_alloc(void) +{ + struct obj_cgroup *objcg; + int ret; + + objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); + if (!objcg) + return NULL; + + ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, + GFP_KERNEL); + if (ret) { + kfree(objcg); + return NULL; + } + INIT_LIST_HEAD(&objcg->list); + return objcg; +} + +static void memcg_reparent_objcgs(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + struct obj_cgroup *objcg, *iter; + + objcg = rcu_replace_pointer(memcg->objcg, NULL, true); + + spin_lock_irq(&css_set_lock); + + /* Move active objcg to the parent's list */ + xchg(&objcg->memcg, parent); + css_get(&parent->css); + list_add(&objcg->list, &parent->objcg_list); + + /* Move already reparented objcgs to the parent's list */ + list_for_each_entry(iter, &memcg->objcg_list, list) { + css_get(&parent->css); + xchg(&iter->memcg, parent); + css_put(&memcg->css); + } + list_splice(&memcg->objcg_list, &parent->objcg_list); + + spin_unlock_irq(&css_set_lock); + + percpu_ref_kill(&objcg->refcnt); +} + /* - * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. + * This will be used as a shrinker list's index. * The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, * but only a few kmem-limited. Or also, if we have, for instance, 200 @@ -301,14 +391,12 @@ void memcg_put_cache_ids(void) /* * A lot of the calls to the cache allocation functions are expected to be - * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are + * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); - -struct workqueue_struct *memcg_kmem_cache_wq; #endif static int memcg_shrinker_map_size; @@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - if (PageSlab(page) && !PageTail(page)) - memcg = memcg_from_slab_page(page); - else - memcg = READ_ONCE(page->mem_cgroup); + memcg = page->mem_cgroup; + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if ((unsigned long) memcg & 0x1UL) + memcg = NULL; + while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); if (memcg) @@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) */ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - long x; + long x, threshold = MEMCG_CHARGE_BATCH; if (mem_cgroup_disabled()) return; + if (vmstat_item_in_bytes(idx)) + threshold <<= PAGE_SHIFT; + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + if (unlikely(abs(x) > threshold)) { struct mem_cgroup *mi; /* @@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) return mem_cgroup_nodeinfo(parent, nid); } -/** - * __mod_lruvec_state - update lruvec memory statistics - * @lruvec: the lruvec - * @idx: the stat item - * @val: delta to add to the counter, can be negative - * - * The lruvec is the intersection of the NUMA node and a cgroup. This - * function updates the all three counters that are affected by a - * change of state at this level: per-node, per-cgroup, per-lruvec. - */ -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, - int val) +void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x; - - /* Update node */ - __mod_node_page_state(pgdat, idx, val); - - if (mem_cgroup_disabled()) - return; + long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stat_local->count[idx], val); + if (vmstat_item_in_bytes(idx)) + threshold <<= PAGE_SHIFT; + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + if (unlikely(abs(x) > threshold)) { + pg_data_t *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup_per_node *pi; for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) @@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. 
This + * function updates the all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec. + */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) +{ + /* Update node */ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + + /* Update memcg and lruvec */ + if (!mem_cgroup_disabled()) + __mod_memcg_lruvec_state(lruvec, idx, val); +} + void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); @@ -1377,12 +1483,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) * 1024); seq_buf_printf(&s, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * - PAGE_SIZE); + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); seq_buf_printf(&s, "sock %llu\n", (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); @@ -1412,11 +1517,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg) PAGE_SIZE); seq_buf_printf(&s, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B)); seq_buf_printf(&s, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)); /* Accumulated memory events */ @@ -1560,15 +1663,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, .gfp_mask = gfp_mask, .order = order, }; - bool ret; + bool ret = true; if (mutex_lock_killable(&oom_lock)) return true; + + if (mem_cgroup_margin(memcg) >= (1 << order)) + goto unlock; + /* * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ ret = should_force_charge() || out_of_memory(&oc); + +unlock: mutex_unlock(&oom_lock); return ret; } @@ -2039,6 +2148,12 @@ EXPORT_SYMBOL(unlock_page_memcg); struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; + +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cached_objcg; + unsigned int nr_bytes; +#endif + struct work_struct work; unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 @@ -2046,6 +2161,22 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); +#ifdef CONFIG_MEMCG_KMEM +static void drain_obj_stock(struct memcg_stock_pcp *stock); +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg); + +#else +static inline void drain_obj_stock(struct memcg_stock_pcp *stock) +{ +} +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +{ + return false; +} +#endif + /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. 
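Editor's note on the hunk above: it bolts a byte-level stock (cached_objcg/nr_bytes) onto the existing page-level per-cpu stock in struct memcg_stock_pcp; the consume/refill/drain helpers that operate on it appear further down in this diff. What follows is a minimal, single-threaded userspace model of that arithmetic only, written to illustrate how sub-page charges are batched into whole pages. The per-cpu/irq handling, RCU protection, css and objcg reference counting, and the real page_counter are deliberately left out, and the struct and helper names merely mirror the kernel ones.

/*
 * Single-threaded userspace model of the byte-level obj_cgroup stock.
 * Illustration only: names mirror the kernel helpers, but locking, RCU,
 * refcounting and the real page_counter are intentionally omitted.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

struct obj_cgroup {
	unsigned int nr_charged_bytes;	/* leftover flushed out of the stock */
	unsigned int nr_pages_charged;	/* stands in for the page_counter    */
};

struct obj_stock {
	struct obj_cgroup *cached_objcg;
	unsigned int nr_bytes;
};

static struct obj_stock stock;

static void drain_obj_stock(struct obj_stock *s)
{
	struct obj_cgroup *old = s->cached_objcg;

	if (!old)
		return;
	if (s->nr_bytes) {
		unsigned int nr_pages = s->nr_bytes / PAGE_SIZE;
		unsigned int rem = s->nr_bytes % PAGE_SIZE;

		old->nr_pages_charged -= nr_pages;	/* whole pages are uncharged  */
		old->nr_charged_bytes += rem;		/* sub-page leftover is kept  */
		s->nr_bytes = 0;
	}
	s->cached_objcg = NULL;
}

static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	if (stock.cached_objcg != objcg) {	/* reset if necessary */
		drain_obj_stock(&stock);
		stock.cached_objcg = objcg;
		stock.nr_bytes = objcg->nr_charged_bytes;
		objcg->nr_charged_bytes = 0;
	}
	stock.nr_bytes += nr_bytes;
	if (stock.nr_bytes > PAGE_SIZE)
		drain_obj_stock(&stock);
}

static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	if (objcg == stock.cached_objcg && stock.nr_bytes >= nr_bytes) {
		stock.nr_bytes -= nr_bytes;
		return true;
	}
	return false;
}

int main(void)
{
	struct obj_cgroup cg = { 0 };

	/* Charge one page for a 700-byte object, park the remainder. */
	cg.nr_pages_charged += 1;
	refill_obj_stock(&cg, PAGE_SIZE - 700);

	/* A second 700-byte object is served entirely from the stock. */
	assert(consume_obj_stock(&cg, 700));

	/* Another cgroup takes over this cpu: the leftover is flushed. */
	drain_obj_stock(&stock);
	printf("pages=%u leftover_bytes=%u\n",
	       cg.nr_pages_charged, cg.nr_charged_bytes);
	return 0;
}

Run standalone, this charges one page for the first 700-byte object, serves the second one from the cached remainder, and flushes the remaining bytes to nr_charged_bytes when the stock is drained, which is where refill_obj_stock() picks them up again on the next charge for the same cgroup.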
@@ -2086,13 +2217,17 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; + if (!old) + return; + if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); if (do_memsw_account()) page_counter_uncharge(&old->memsw, stock->nr_pages); - css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } + + css_put(&old->css); stock->cached = NULL; } @@ -2108,6 +2243,7 @@ static void drain_local_stock(struct work_struct *dummy) local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); + drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); @@ -2128,6 +2264,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); + css_get(&memcg->css); stock->cached = memcg; } stock->nr_pages += nr_pages; @@ -2166,6 +2303,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; + if (obj_stock_flush_required(stock, root_memcg)) + flush = true; rcu_read_unlock(); if (flush && @@ -2228,18 +2367,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) return 0; } -static void reclaim_high(struct mem_cgroup *memcg, - unsigned int nr_pages, - gfp_t gfp_mask) +static unsigned long reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) { + unsigned long nr_reclaimed = 0; + do { + unsigned long pflags; + if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->memory.high)) continue; + memcg_memory_event(memcg, MEMCG_HIGH); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + + psi_memstall_enter(&pflags); + nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, + gfp_mask, true); + psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); + + return nr_reclaimed; } static void high_work_func(struct work_struct *work) @@ -2395,16 +2545,32 @@ void mem_cgroup_handle_over_high(void) { unsigned long penalty_jiffies; unsigned long pflags; + unsigned long nr_reclaimed; unsigned int nr_pages = current->memcg_nr_pages_over_high; + int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *memcg; + bool in_retry = false; if (likely(!nr_pages)) return; memcg = get_mem_cgroup_from_mm(current->mm); - reclaim_high(memcg, nr_pages, GFP_KERNEL); current->memcg_nr_pages_over_high = 0; +retry_reclaim: + /* + * The allocating task should reclaim at least the batch size, but for + * subsequent retries we only want to do what's necessary to prevent oom + * or breaching resource isolation. + * + * This is distinct from memory.max or page allocator behaviour because + * memory.high is currently batched, whereas memory.max and the page + * allocator run every time an allocation is made. + */ + nr_reclaimed = reclaim_high(memcg, + in_retry ? SWAP_CLUSTER_MAX : nr_pages, + GFP_KERNEL); + /* * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. @@ -2432,6 +2598,16 @@ void mem_cgroup_handle_over_high(void) goto out; /* + * If reclaim is making forward progress but we're still over + * memory.high, we want to encourage that rather than doing allocator + * throttling. 
+ */ + if (nr_reclaimed || nr_retries--) { + in_retry = true; + goto retry_reclaim; + } + + /* * If we exit early, we're guaranteed to die (since * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. @@ -2448,13 +2624,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; + enum oom_status oom_status; unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - enum oom_status oom_status; + unsigned long pflags; if (mem_cgroup_is_root(memcg)) return 0; @@ -2514,8 +2691,10 @@ retry: memcg_memory_event(mem_over_limit, MEMCG_MAX); + psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); + psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) goto retry; @@ -2567,7 +2746,7 @@ retry: get_order(nr_pages * PAGE_SIZE)); switch (oom_status) { case OOM_SUCCESS: - nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + nr_retries = MAX_RECLAIM_RETRIES; goto retry; case OOM_FAILED: goto force; @@ -2586,12 +2765,10 @@ force: page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); return 0; done_restock: - css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); @@ -2649,8 +2826,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); - - css_put_many(&memcg->css, nr_pages); } #endif @@ -2669,6 +2844,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) } #ifdef CONFIG_MEMCG_KMEM +int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, + gfp_t gfp) +{ + unsigned int objects = objs_per_slab_page(s, page); + void *vec; + + vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, + page_to_nid(page)); + if (!vec) + return -ENOMEM; + + if (cmpxchg(&page->obj_cgroups, NULL, + (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + kfree(vec); + else + kmemleak_not_leak(vec); + + return 0; +} + /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * @@ -2685,17 +2880,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); /* - * Slab pages don't have page->mem_cgroup set because corresponding - * kmem caches can be reparented during the lifetime. That's why - * memcg_from_slab_page() should be used instead. + * Slab objects are accounted individually, not per-page. + * Memcg membership data for each individual object is saved in + * the page->obj_cgroups. 
*/ - if (PageSlab(page)) - return memcg_from_slab_page(page); + if (page_has_obj_cgroups(page)) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(page->slab_cache, page, p); + objcg = page_obj_cgroups(page)[off]; + if (objcg) + return obj_cgroup_memcg(objcg); + + return NULL; + } /* All other pages use page->mem_cgroup */ return page->mem_cgroup; } +__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) +{ + struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg; + + if (unlikely(!current->mm && !current->active_memcg)) + return NULL; + + rcu_read_lock(); + if (unlikely(current->active_memcg)) + memcg = rcu_dereference(current->active_memcg); + else + memcg = mem_cgroup_from_task(current); + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + objcg = rcu_dereference(memcg->objcg); + if (objcg && obj_cgroup_tryget(objcg)) + break; + } + rcu_read_unlock(); + + return objcg; +} + static int memcg_alloc_cache_id(void) { int id, size; @@ -2721,9 +2949,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - err = memcg_update_all_caches(size); - if (!err) - err = memcg_update_all_list_lrus(size); + err = memcg_update_all_list_lrus(size); if (!err) memcg_nr_cache_ids = size; @@ -2741,150 +2967,6 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -struct memcg_kmem_cache_create_work { - struct mem_cgroup *memcg; - struct kmem_cache *cachep; - struct work_struct work; -}; - -static void memcg_kmem_cache_create_func(struct work_struct *w) -{ - struct memcg_kmem_cache_create_work *cw = - container_of(w, struct memcg_kmem_cache_create_work, work); - struct mem_cgroup *memcg = cw->memcg; - struct kmem_cache *cachep = cw->cachep; - - memcg_create_kmem_cache(memcg, cachep); - - css_put(&memcg->css); - kfree(cw); -} - -/* - * Enqueue the creation of a per-memcg kmem_cache. - */ -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - struct memcg_kmem_cache_create_work *cw; - - if (!css_tryget_online(&memcg->css)) - return; - - cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); - if (!cw) { - css_put(&memcg->css); - return; - } - - cw->memcg = memcg; - cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - - queue_work(memcg_kmem_cache_wq, &cw->work); -} - -static inline bool memcg_kmem_bypass(void) -{ - if (in_interrupt()) - return true; - - /* Allow remote memcg charging in kthread contexts. */ - if ((!current->mm || (current->flags & PF_KTHREAD)) && - !current->active_memcg) - return true; - return false; -} - -/** - * memcg_kmem_get_cache: select the correct per-memcg cache for allocation - * @cachep: the original global kmem cache - * - * Return the kmem_cache we're supposed to use for a slab allocation. - * We try to use the current memcg's version of the cache. - * - * If the cache does not exist yet, if we are the first user of it, we - * create it asynchronously in a workqueue and let the current allocation - * go through with the original cache. - * - * This function takes a reference to the cache it returns to assure it - * won't get destroyed while we are working with it. Once the caller is - * done with it, memcg_kmem_put_cache() must be called to release the - * reference. 
- */ -struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) -{ - struct mem_cgroup *memcg; - struct kmem_cache *memcg_cachep; - struct memcg_cache_array *arr; - int kmemcg_id; - - VM_BUG_ON(!is_root_cache(cachep)); - - if (memcg_kmem_bypass()) - return cachep; - - rcu_read_lock(); - - if (unlikely(current->active_memcg)) - memcg = current->active_memcg; - else - memcg = mem_cgroup_from_task(current); - - if (!memcg || memcg == root_mem_cgroup) - goto out_unlock; - - kmemcg_id = READ_ONCE(memcg->kmemcg_id); - if (kmemcg_id < 0) - goto out_unlock; - - arr = rcu_dereference(cachep->memcg_params.memcg_caches); - - /* - * Make sure we will access the up-to-date value. The code updating - * memcg_caches issues a write barrier to match the data dependency - * barrier inside READ_ONCE() (see memcg_create_kmem_cache()). - */ - memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]); - - /* - * If we are in a safe context (can wait, and not in interrupt - * context), we could be be predictable and return right away. - * This would guarantee that the allocation being performed - * already belongs in the new cache. - * - * However, there are some clashes that can arrive from locking. - * For instance, because we acquire the slab_mutex while doing - * memcg_create_kmem_cache, this means no further allocation - * could happen with the slab_mutex held. So it's better to - * defer everything. - * - * If the memcg is dying or memcg_cache is about to be released, - * don't bother creating new kmem_caches. Because memcg_cachep - * is ZEROed as the fist step of kmem offlining, we don't need - * percpu_ref_tryget_live() here. css_tryget_online() check in - * memcg_schedule_kmem_cache_create() will prevent us from - * creation of a new kmem_cache. - */ - if (unlikely(!memcg_cachep)) - memcg_schedule_kmem_cache_create(memcg, cachep); - else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) - cachep = memcg_cachep; -out_unlock: - rcu_read_unlock(); - return cachep; -} - -/** - * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache - * @cachep: the cache returned by memcg_kmem_get_cache - */ -void memcg_kmem_put_cache(struct kmem_cache *cachep) -{ - if (!is_root_cache(cachep)) - percpu_ref_put(&cachep->memcg_params.refcnt); -} - /** * __memcg_kmem_charge: charge a number of kernel pages to a memcg * @memcg: memory cgroup to charge @@ -2958,6 +3040,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (!ret) { page->mem_cgroup = memcg; __SetPageKmemcg(page); + return 0; } } css_put(&memcg->css); @@ -2980,13 +3063,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); page->mem_cgroup = NULL; + css_put(&memcg->css); /* slab pages do not have PageKmemcg flag set */ if (PageKmemcg(page)) __ClearPageKmemcg(page); +} + +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +{ + struct memcg_stock_pcp *stock; + unsigned long flags; + bool ret = false; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + ret = true; + } + + local_irq_restore(flags); + + return ret; +} + +static void drain_obj_stock(struct memcg_stock_pcp *stock) +{ + struct obj_cgroup *old = stock->cached_objcg; + + if (!old) + return; + + if (stock->nr_bytes) { + unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes & 
(PAGE_SIZE - 1); + + if (nr_pages) { + rcu_read_lock(); + __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); + rcu_read_unlock(); + } + + /* + * The leftover is flushed to the centralized per-memcg value. + * On the next attempt to refill obj stock it will be moved + * to a per-cpu stock (probably, on an other CPU), see + * refill_obj_stock(). + * + * How often it's flushed is a trade-off between the memory + * limit enforcement accuracy and potential CPU contention, + * so it might be changed in the future. + */ + atomic_add(nr_bytes, &old->nr_charged_bytes); + stock->nr_bytes = 0; + } + + obj_cgroup_put(old); + stock->cached_objcg = NULL; +} + +static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +{ + struct mem_cgroup *memcg; + + if (stock->cached_objcg) { + memcg = obj_cgroup_memcg(stock->cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } + + return false; +} + +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +{ + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached_objcg != objcg) { /* reset if necessary */ + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->cached_objcg = objcg; + stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); + } + stock->nr_bytes += nr_bytes; + + if (stock->nr_bytes > PAGE_SIZE) + drain_obj_stock(stock); + + local_irq_restore(flags); +} + +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + struct mem_cgroup *memcg; + unsigned int nr_pages, nr_bytes; + int ret; + + if (consume_obj_stock(objcg, size)) + return 0; + + /* + * In theory, memcg->nr_charged_bytes can have enough + * pre-charged bytes to satisfy the allocation. However, + * flushing memcg->nr_charged_bytes requires two atomic + * operations, and memcg->nr_charged_bytes can't be big, + * so it's better to ignore it and try grab some new pages. + * memcg->nr_charged_bytes will be flushed in + * refill_obj_stock(), called from this function or + * independently later. 
+ */ + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + css_get(&memcg->css); + rcu_read_unlock(); + + nr_pages = size >> PAGE_SHIFT; + nr_bytes = size & (PAGE_SIZE - 1); + + if (nr_bytes) + nr_pages += 1; - css_put_many(&memcg->css, nr_pages); + ret = __memcg_kmem_charge(memcg, gfp, nr_pages); + if (!ret && nr_bytes) + refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); + + css_put(&memcg->css); + return ret; } + +void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) +{ + refill_obj_stock(objcg, size); +} + #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2997,13 +3213,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) */ void mem_cgroup_split_huge_fixup(struct page *head) { + struct mem_cgroup *memcg = head->mem_cgroup; int i; if (mem_cgroup_disabled()) return; - for (i = 1; i < HPAGE_PMD_NR; i++) - head[i].mem_cgroup = head->mem_cgroup; + for (i = 1; i < HPAGE_PMD_NR; i++) { + css_get(&memcg->css); + head[i].mem_cgroup = memcg; + } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3207,7 +3426,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) */ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) { - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + int nr_retries = MAX_RECLAIM_RETRIES; /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); @@ -3404,6 +3623,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { + struct obj_cgroup *objcg; int memcg_id; if (cgroup_memory_nokmem) @@ -3416,7 +3636,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) if (memcg_id < 0) return memcg_id; - static_branch_inc(&memcg_kmem_enabled_key); + objcg = obj_cgroup_alloc(); + if (!objcg) { + memcg_free_cache_id(memcg_id); + return -ENOMEM; + } + objcg->memcg = memcg; + rcu_assign_pointer(memcg->objcg, objcg); + + static_branch_enable(&memcg_kmem_enabled_key); + /* * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. Setting the id after enabling static branching will @@ -3425,7 +3654,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; - INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -3438,22 +3666,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (memcg->kmem_state != KMEM_ONLINE) return; - /* - * Clear the online state before clearing memcg_caches array - * entries. The slab_mutex in memcg_deactivate_kmem_caches() - * guarantees that no cache will be created for this cgroup - * after we are done (see memcg_create_kmem_cache()). - */ + memcg->kmem_state = KMEM_ALLOCATED; parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; - /* - * Deactivate and reparent kmem_caches. 
- */ - memcg_deactivate_kmem_caches(memcg, parent); + memcg_reparent_objcgs(memcg, parent); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -3486,11 +3706,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) /* css_alloc() failed, offlining didn't happen */ if (unlikely(memcg->kmem_state == KMEM_ONLINE)) memcg_offline_kmem(memcg); - - if (memcg->kmem_state == KMEM_ALLOCATED) { - WARN_ON(!list_empty(&memcg->kmem_caches)); - static_branch_dec(&memcg_kmem_enabled_key); - } } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -4800,9 +5015,6 @@ static struct cftype mem_cgroup_legacy_files[] = { (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_start = memcg_slab_start, - .seq_next = memcg_slab_next, - .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif @@ -5022,6 +5234,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->socket_pressure = jiffies; #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; + INIT_LIST_HEAD(&memcg->objcg_list); #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); @@ -5084,9 +5297,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* The following stuff does not apply to the root */ if (!parent) { -#ifdef CONFIG_MEMCG_KMEM - INIT_LIST_HEAD(&memcg->kmem_caches); -#endif root_mem_cgroup = memcg; return &memcg->css; } @@ -5448,7 +5658,10 @@ static int mem_cgroup_move_account(struct page *page, */ smp_mb(); - page->mem_cgroup = to; /* caller should have done css_get */ + css_get(&to->css); + css_put(&from->css); + + page->mem_cgroup = to; __unlock_page_memcg(from); @@ -5669,8 +5882,6 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.to->css, mc.moved_swap); - mc.moved_swap = 0; } memcg_oom_recover(from); @@ -6036,7 +6247,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + unsigned int nr_retries = MAX_RECLAIM_RETRIES; bool drained = false; unsigned long high; int err; @@ -6046,8 +6257,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - page_counter_set_high(&memcg->memory, high); - for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -6071,6 +6280,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, break; } + page_counter_set_high(&memcg->memory, high); + + memcg_wb_domain_size_changed(memcg); + return nbytes; } @@ -6084,7 +6297,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; bool drained = false; unsigned long max; int err; @@ -6391,40 +6604,42 @@ static unsigned long effective_protection(unsigned long usage, * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. - * - * Returns one of the following: - * MEMCG_PROT_NONE: cgroup memory is not protected - * MEMCG_PROT_LOW: cgroup memory is protected as long there is - * an unprotected supply of reclaimable memory from other cgroups. 
- * MEMCG_PROT_MIN: cgroup memory is protected */ -enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, - struct mem_cgroup *memcg) +void mem_cgroup_calculate_protection(struct mem_cgroup *root, + struct mem_cgroup *memcg) { unsigned long usage, parent_usage; struct mem_cgroup *parent; if (mem_cgroup_disabled()) - return MEMCG_PROT_NONE; + return; if (!root) root = root_mem_cgroup; + + /* + * Effective values of the reclaim targets are ignored so they + * can be stale. Have a look at mem_cgroup_protection for more + * details. + * TODO: calculation should be more robust so that we do not need + * that special casing. + */ if (memcg == root) - return MEMCG_PROT_NONE; + return; usage = page_counter_read(&memcg->memory); if (!usage) - return MEMCG_PROT_NONE; + return; parent = parent_mem_cgroup(memcg); /* No parent means a non-hierarchical mode on v1 memcg */ if (!parent) - return MEMCG_PROT_NONE; + return; if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); memcg->memory.elow = READ_ONCE(memcg->memory.low); - goto out; + return; } parent_usage = page_counter_read(&parent->memory); @@ -6438,14 +6653,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, READ_ONCE(memcg->memory.low), READ_ONCE(parent->memory.elow), atomic_long_read(&parent->memory.children_low_usage))); - -out: - if (usage <= memcg->memory.emin) - return MEMCG_PROT_MIN; - else if (usage <= memcg->memory.elow) - return MEMCG_PROT_LOW; - else - return MEMCG_PROT_NONE; } /** @@ -6498,6 +6705,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) if (ret) goto out_put; + css_get(&memcg->css); commit_charge(page, memcg); local_irq_disable(); @@ -6552,9 +6760,6 @@ static void uncharge_batch(const struct uncharge_gather *ug) __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); - - if (!mem_cgroup_is_root(ug->memcg)) - css_put_many(&ug->memcg->css, ug->nr_pages); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) @@ -6592,6 +6797,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->dummy_page = page; page->mem_cgroup = NULL; + css_put(&ug->memcg->css); } static void uncharge_list(struct list_head *page_list) @@ -6697,8 +6903,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); + css_get(&memcg->css); commit_charge(newpage, memcg); local_irq_save(flags); @@ -6821,17 +7027,6 @@ static int __init mem_cgroup_init(void) { int cpu, node; -#ifdef CONFIG_MEMCG_KMEM - /* - * Kmem cache creation is mostly done with the slab_mutex held, - * so use a workqueue with limited concurrency to avoid stalling - * all worker threads in case lots of cgroups are created and - * destroyed simultaneously. - */ - memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); - BUG_ON(!memcg_kmem_cache_wq); -#endif - cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); @@ -6935,8 +7130,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); - if (!mem_cgroup_is_root(memcg)) - css_put_many(&memcg->css, nr_entries); + css_put(&memcg->css); } /** |
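Editor's note on the ownership side of this change: slab pages no longer carry a single page->mem_cgroup, but a vector of per-object obj_cgroup pointers, with bit 0 of the pointer marking which case applies (see memcg_alloc_page_obj_cgroups() and mem_cgroup_from_obj() in the diff above). The sketch below is a self-contained userspace illustration of that tagged-pointer lookup under simplifying assumptions: the helpers alloc_obj_cgroups() and memcg_from_obj() are invented for the example, the object index would really come from obj_to_index(), and all RCU protection and reference counting are omitted.

/*
 * Userspace sketch of the tagged-pointer scheme: bit 0 of the
 * page->mem_cgroup word distinguishes a plain memcg pointer from a
 * per-object obj_cgroup vector. Illustration only, not kernel code.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mem_cgroup { const char *name; };
struct obj_cgroup { struct mem_cgroup *memcg; };

struct page {
	/* either a memcg pointer or (obj_cgroup vector | 0x1) */
	uintptr_t mem_cgroup;
};

static int page_has_obj_cgroups(const struct page *page)
{
	return page->mem_cgroup & 0x1;
}

static struct obj_cgroup **page_obj_cgroups(const struct page *page)
{
	return (struct obj_cgroup **)(page->mem_cgroup & ~(uintptr_t)0x1);
}

/* Models memcg_alloc_page_obj_cgroups(): attach a per-object vector. */
static int alloc_obj_cgroups(struct page *page, unsigned int objects)
{
	struct obj_cgroup **vec = calloc(objects, sizeof(*vec));

	if (!vec)
		return -1;
	page->mem_cgroup = (uintptr_t)vec | 0x1;
	return 0;
}

/* Models the slab branch of mem_cgroup_from_obj(). */
static struct mem_cgroup *memcg_from_obj(struct page *page, unsigned int idx)
{
	if (page_has_obj_cgroups(page)) {
		struct obj_cgroup *objcg = page_obj_cgroups(page)[idx];

		return objcg ? objcg->memcg : NULL;
	}
	return (struct mem_cgroup *)page->mem_cgroup;
}

int main(void)
{
	struct mem_cgroup a = { "A" }, b = { "B" };
	struct obj_cgroup objcg_a = { &a }, objcg_b = { &b };
	struct page slab_page = { 0 };

	/* One slab page, objects 0 and 2 charged to different cgroups. */
	assert(!alloc_obj_cgroups(&slab_page, 4));
	page_obj_cgroups(&slab_page)[0] = &objcg_a;
	page_obj_cgroups(&slab_page)[2] = &objcg_b;

	printf("obj0 -> %s\n", memcg_from_obj(&slab_page, 0)->name);
	printf("obj2 -> %s\n", memcg_from_obj(&slab_page, 2)->name);
	assert(memcg_from_obj(&slab_page, 1) == NULL);	/* uncharged object */
	return 0;
}

Reusing the same word in struct page for both cases keeps the structure at its current size while still letting objects that share one slab page be attributed to different cgroups, which is the point of the per-object accounting introduced by this commit.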