diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 13 | ||||
-rw-r--r-- | mm/hugetlb.c | 11 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 19 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 1242 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 18 | ||||
-rw-r--r-- | mm/mprotect.c | 30 | ||||
-rw-r--r-- | mm/page_alloc.c | 38 | ||||
-rw-r--r-- | mm/slab.c | 94 | ||||
-rw-r--r-- | mm/slab.h | 137 | ||||
-rw-r--r-- | mm/slab_common.c | 118 | ||||
-rw-r--r-- | mm/slob.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 150 | ||||
-rw-r--r-- | mm/vmscan.c | 14 |
14 files changed, 1740 insertions, 149 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 71259e052ce8..278e3ab1f169 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,7 +149,18 @@ config MOVABLE_NODE depends on NO_BOOTMEM depends on X86_64 depends on NUMA - depends on BROKEN + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5318c7793ae..4f3ea0b1e57c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); return 0; } @@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgoup details. 
- */ - if (order >= HUGETLB_CGROUP_MIN_ORDER) - hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b5bde7a5c017..9cea7de22ffb 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -int __init hugetlb_cgroup_file_init(int idx) +static void __init __hugetlb_cgroup_file_init(int idx) { char buf[32]; struct cftype *cft; @@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - return 0; + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } } /* diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bbfac5063ca8..f3009b4bae51 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. 
+ * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -268,6 +272,10 @@ struct mem_cgroup { }; /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. */ @@ -282,6 +290,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -332,8 +341,61 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. 
*/ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a @@ -388,9 +450,13 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 
+ * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -1453,6 +1588,10 @@ done: res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -2060,20 +2199,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. 
+ * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2250,7 +2397,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. 
*/ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2371,7 +2519,7 @@ again: memcg = *ptr; if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2396,7 +2544,7 @@ again: rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2431,7 +2579,8 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; @@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. 
+ */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + /* + * Conditions under which we can wait for the oom_killer. Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chose to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above.
Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); +} + +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for accessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true.
We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; + } + + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. + */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. 
+ * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + kfree(cur_params); + } + return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: + kfree(s->memcg_params); +} + +/* + * During the creation of a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueuing new caches to be created.
+ * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. + * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. 
+ * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. 
+ */ + schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor, s); + + if (new) + new->allocflags |= __GFP_KMEMCG; + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. 
+ */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + atomic_set(&new_cachep->memcg_params->nr_pages , 0); + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should catch this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still has active pages until this very moment.
+ * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. 
*/ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. 
+ */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to make sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arise from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issues get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements.
+ * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. 
+ */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) @@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; + u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) cond_resched(); /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * * This is a safety check because mem_cgroup_force_empty_list * could have raced with mem_cgroup_replace_page_cache callers * so the lru seemed empty but the page could have been added * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. 
*/ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); } /* @@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. 
+ */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } + must_inc_static_branch = true; + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. + * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + +#endif + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + goto out; + + memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + /* + * When that happen, we need to disable the static branch only on those + * memcgs that enabled it. 
To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); +#endif +out: + return ret; +} + /* * The user of this function is... * RES_LIMIT. @@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4054,7 +5097,8 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) 
res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); @@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { 
+ int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + return mem_cgroup_sockets_init(memcg, ss); }; static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { .read = mem_cgroup_read, }, #endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif +#endif { }, /* terminate */ }; @@ -4816,16 +5920,29 @@ out_free: } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. 
+ * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. */ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. 
*/ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } @@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. 
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962e353aa86f..d04ed87bfacb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ static bool can_online_high_movable(struct zone *zone) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ static bool can_online_high_movable(struct zone *zone) { return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, @@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MOVABLE_NODE -/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
+ */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { return true; } -#else /* #ifdef CONFIG_MOVABLE_NODE */ +#else /* CONFIG_MOVABLE_NODE */ /* ensure the node has NORMAL memory if it is still online */ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) { @@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) */ return present_pages == 0; } -#endif /* #ifdef CONFIG_MOVABLE_NODE */ +#endif /* CONFIG_MOVABLE_NODE */ /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, diff --git a/mm/mprotect.c b/mm/mprotect.c index 3dca970367db..94722a4d6b43 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, #ifdef CONFIG_NUMA_BALANCING static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { spin_lock(&mm->page_table_lock); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); @@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd) { BUG(); } #endif /* CONFIG_NUMA_BALANCING */ -static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if 
(change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * return pages; } -static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -304,7 +305,8 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); @@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d037c8bc1512..2ad2ad168efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { + if (unlikely(compound_order(page) != order)) { bad_page(page); bad++; } @@ -2613,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2631,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2666,6 +2673,8 @@ out: if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2718,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. 
To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { diff --git a/mm/slab.c b/mm/slab.c index 2c3a2e0394db..e7667a3584bc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -87,7 +87,6 @@ */ #include <linux/slab.h> -#include "slab.h" #include <linux/mm.h> #include <linux/poison.h> #include <linux/swap.h> @@ -128,6 +127,8 @@ #include "internal.h" +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
@@ -641,6 +642,26 @@ static void init_node_lock_keys(int q) } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + struct kmem_list3 *l3; + l3 = cachep->nodelists[q]; + if (!l3) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -657,6 +678,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -1385,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1863,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (page->pfmemalloc) SetPageSlabPfmemalloc(page + i); } + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1899,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) __ClearPageSlab(page); page++; } + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -2489,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache 
*cachep, unsigned long flags) WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); return 0; } @@ -3453,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3538,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3851,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); @@ -3998,7 +4041,7 @@ static void do_ccupdate_local(void *info) } /* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4041,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return alloc_kmemlist(cachep, gfp); } +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = cache_from_memcg(cachep, i); + if (c) + /* return 
value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + /* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4088,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); diff --git a/mm/slab.h b/mm/slab.h index 1cb9c9ee0e6f..34a98d642196 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,12 +43,15 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); +struct mem_cgroup; #ifdef CONFIG_SLUB -struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); #else -static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void 
(*ctor)(void *)) { return NULL; } #endif @@ -100,4 +103,130 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. 
But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return s->memcg_params->memcg_caches[idx]; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. 
%s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index a8e76d79ee65..3f3cd97d3fdf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> +#include <linux/memcontrol.h> #include "slab.h" @@ -27,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, + size_t size) { struct kmem_cache *s = NULL; @@ -53,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) continue; } - if (!strcmp(s->name, name)) { + /* + * For simplicity, we won't check this in the list of memcg + * caches. We have control over memcg naming, and if there + * aren't duplicates in the global list, there won't be any + * duplicates in the memcg lists as well. + */ + if (!memcg && !strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -66,12 +74,41 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, + const char *name, size_t size) { return 0; } #endif +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. 
+ */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + /* * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. @@ -125,8 +162,10 @@ unsigned long calculate_alignment(unsigned long flags, * as davem. */ -struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), + struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; int err = 0; @@ -134,7 +173,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(name, size) == 0) + if (!kmem_cache_sanity_check(memcg, name, size) == 0) goto out_locked; /* @@ -145,7 +184,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(name, size, align, flags, ctor); + s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) goto out_locked; @@ -154,6 +193,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align s->object_size = s->size = size; s->align = calculate_alignment(flags, align, size); s->ctor = ctor; + + if (memcg_register_cache(memcg, s, parent_cache)) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) { kmem_cache_free(kmem_cache, s); @@ -163,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align err = __kmem_cache_create(s, flags); if (!err) { - s->refcount = 1; list_add(&s->list, &slab_caches); - + memcg_cache_list_add(memcg, s); } else { kfree(s->name); kmem_cache_free(kmem_cache, s); 
@@ -194,10 +239,20 @@ out_locked: return s; } + +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); +} EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *s) { + /* Destroy all the children caches if we aren't a memcg cache */ + kmem_cache_destroy_memcg_children(s); + get_online_cpus(); mutex_lock(&slab_mutex); s->refcount--; @@ -209,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); + memcg_release_cache(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { @@ -267,7 +323,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) +void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -311,16 +367,43 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&slab_mutex); } -static int s_show(struct seq_file *m, void *p) +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); struct slabinfo sinfo; memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(s, &sinfo); + memcg_accumulate_slabinfo(s, &sinfo); + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - s->name, 
sinfo.active_objs, sinfo.num_objs, s->size, + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, sinfo.objects_per_slab, (1 << sinfo.cache_order)); seq_printf(m, " : tunables %4u %4u %4u", @@ -332,6 +415,15 @@ static int s_show(struct seq_file *m, void *p) return 0; } +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + /* * slabinfo_op - iterator that generates /proc/slabinfo * diff --git a/mm/slob.c b/mm/slob.c index 795bab7d391d..a99fdf7a0907 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -58,7 +58,6 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include "slab.h" #include <linux/mm.h> #include <linux/swap.h> /* struct reclaim_state */ @@ -73,6 +72,7 @@ #include <linux/atomic.h> +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). diff --git a/mm/slub.c b/mm/slub.c index 87f9f32bf0cd..ba2ca53f6c3a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -31,6 +31,7 @@ #include <linux/fault-inject.h> #include <linux/stacktrace.h> #include <linux/prefetch.h> +#include <linux/memcontrol.h> #include <trace/events/kmem.h> @@ -200,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -1343,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, 
gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1351,7 +1354,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); + memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1360,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1401,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); + + memcg_release_pages(s, order); reset_page_mapcount(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -2322,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: /* @@ -2610,19 +2618,10 @@ redo: void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - if (kmem_cache_debug(s) && page->slab_cache != s) { - pr_err("kmem_cache_free: Wrong slab cache. 
%s but object" - " is from %s\n", page->slab_cache->name, s->name); - WARN_ON_ONCE(1); + s = cache_from_obj(s, x); + if (!s) return; - } - - slab_free(s, page, x, _RET_IP_); - + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -3154,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { int rc = kmem_cache_close(s); - if (!rc) + if (!rc) { + /* + * We do the same lock strategy around sysfs_slab_add, see + * __kmem_cache_create. Because this is pretty much the last + * operation we do and the lock will be released shortly after + * that in slab_common.c, we could just move sysfs_slab_remove + * to a later point in common code. We should do that when we + * have a common sysfs framework for all allocators. + */ + mutex_unlock(&slab_mutex); sysfs_slab_remove(s); + mutex_lock(&slab_mutex); + } return rc; } @@ -3292,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3398,7 +3408,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - __free_pages(page, compound_order(page)); + __free_memcg_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); @@ -3786,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, +static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -3822,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, if (s->size - size >= sizeof(void *)) continue; + if (!cache_match_memcg(s, memcg)) + continue; + return s; } return NULL; } 
-struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(size, align, flags, name, ctor); + s = find_mergeable(memcg, size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3863,6 +3877,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) if (slab_state <= UP) return 0; + memcg_propagate_slab_attrs(s); mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); mutex_lock(&slab_mutex); @@ -5096,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. 
+ */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + + if (!is_root_cache(s)) + return; + + /* + * This mean this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!s->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. 
+ */ + if (buffer) + buf = buffer; + else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(s->memcg_params->root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5156,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f3096137b8a..828530e2794a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } |