diff options
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 232 |
1 files changed, 156 insertions, 76 deletions
diff --git a/mm/slub.c b/mm/slub.c index fe376fe1f4fe..6832c4eab104 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -20,6 +20,7 @@ #include <linux/proc_fs.h> #include <linux/notifier.h> #include <linux/seq_file.h> +#include <linux/kasan.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -468,12 +469,30 @@ static char *slub_debug_slabs; static int disable_higher_order_debug; /* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. + */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kasan_enable_current(); +} + +/* * Object debugging */ static void print_section(char *text, u8 *addr, unsigned int length) { + metadata_access_enable(); print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); + metadata_access_disable(); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object, trace.max_entries = TRACK_ADDRS_COUNT; trace.entries = p->addrs; trace.skip = 3; + metadata_access_enable(); save_stack_trace(&trace); + metadata_access_disable(); /* See rant in lockdep.c */ if (trace.nr_entries != 0 && @@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) dump_stack(); } -static void object_err(struct kmem_cache *s, struct page *page, +void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { slab_bug(s, "%s", reason); @@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; + metadata_access_enable(); fault = memchr_inv(start, value, bytes); + metadata_access_disable(); if (!fault) return 1; @@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + metadata_access_enable(); fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + metadata_access_disable(); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) @@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); + kasan_kmalloc_large(ptr, size); } static inline void kfree_hook(const void *x) { kmemleak_free(x); + kasan_kfree_large(x); } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, @@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); memcg_kmem_put_cache(s); + kasan_slab_alloc(s, object); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) #endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + + kasan_slab_free(s, x); } /* @@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, void *object) { setup_object_debug(s, page, object); - if (unlikely(s->ctor)) + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); s->ctor(object); + kasan_poison_object_data(s, object); + } } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); + kasan_poison_slab(page); + for_each_object_idx(p, idx, s, start, page->objects) { setup_object(s, page, p); if (likely(idx < page->objects)) @@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages; int pobjects; + preempt_disable(); do { pages = 0; pobjects = 0; @@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + if (unlikely(!s->cpu_partial)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } + preempt_enable(); #endif } @@ -2398,13 +2442,24 @@ redo: * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * Preemption is disabled for the retrieval of the tid because that - * must occur from the current processor. We cannot allow rescheduling - * on a different processor between the determination of the pointer - * and the retrieval of the tid. + * We should guarantee that tid and kmem_cache are retrieved on + * the same cpu. It could be different if CONFIG_PREEMPT so we need + * to check if it is matched or not. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and page associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * page could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. + */ + barrier(); /* * The transaction ids are globally unique per cpu and per operation on @@ -2412,8 +2467,6 @@ redo: * occurs on the right processor and that there was no operation on the * linked list in between. */ - tid = c->tid; - preempt_enable(); object = c->freelist; page = c->page; @@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); + + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif /* - * Slow patch handling. This may still be called frequently since objects + * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab @@ -2659,11 +2715,13 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ - preempt_disable(); - c = this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); - tid = c->tid; - preempt_enable(); + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); @@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif -size_t ksize(const void *object) +static size_t __ksize(const void *object) { struct page *page; @@ -3324,6 +3387,15 @@ size_t ksize(const void *object) return slab_ksize(page->slab_cache); } + +size_t ksize(const void *object) +{ + size_t size = __ksize(object); + /* We assume that ksize callers could use whole allocated area, + so we need unpoison this area. */ + kasan_krealloc(object, size); + return size; +} EXPORT_SYMBOL(ksize); void kfree(const void *x) @@ -3347,69 +3419,92 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +#define SHRINK_PROMOTE_MAX 32 + /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. * * The slabs with the least items are placed last. This results in them * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) { int node; int i; struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; + int ret = 0; - if (!slabs_by_inuse) - return -ENOMEM; + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + kick_all_cpus_sync(); + } flush_all(s); for_each_kmem_cache_node(s, node, n) { - if (!n->nr_partial) - continue; - - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); spin_lock_irqsave(&n->list_lock, flags); /* - * Build lists indexed by the items in use in each slab. + * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); - if (!page->inuse) + int free = page->objects - page->inuse; + + /* Do not reread page->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == page->objects) { + list_move(&page->lru, &discard); n->nr_partial--; + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&page->lru, promote + free - 1); } /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * Promote the slabs filled up most to the head of the + * partial list. */ - for (i = objects - 1; i > 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + list_for_each_entry_safe(page, t, &discard, lru) discard_slab(s, page); + + if (slabs_node(s, node)) + ret = 1; } - kfree(slabs_by_inuse); - return 0; + return ret; } static int slab_mem_going_offline_callback(void *arg) @@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + __kmem_cache_shrink(s, false); mutex_unlock(&slab_mutex); return 0; @@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) p->slab_cache = s; #endif } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -3624,13 +3720,10 @@ struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); if (s) { - int i; - struct kmem_cache *c; - s->refcount++; /* @@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; + for_each_memcg_cache(c, s) { c->object_size = s->object_size; c->inuse = max_t(int, c->inuse, ALIGN(size, sizeof(void *))); @@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf, if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, - PAGE_SIZE - len - 50, - to_cpumask(l->cpus)); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, - PAGE_SIZE - len - 50, - l->nodes); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); len += sprintf(buf + len, "\n"); } @@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') { - int rc = kmem_cache_shrink(s); - - if (rc) - return rc; - } else + if (buf[0] == '1') + kmem_cache_shrink(s); + else return -EINVAL; return length; } @@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif @@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (is_root_cache(s)) return; - root_cache = s->memcg_params->root_cache; + root_cache = s->memcg_params.root_cache; /* * This mean this cache had no attribute written. Therefore, no point @@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) { #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) - return s->memcg_params->root_cache->memcg_kset; + return s->memcg_params.root_cache->memcg_kset; #endif return slab_kset; } |