diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 30 | ||||
-rw-r--r-- | mm/filemap.c | 6 | ||||
-rw-r--r-- | mm/highmem.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 198 | ||||
-rw-r--r-- | mm/mempolicy.c | 9 | ||||
-rw-r--r-- | mm/migrate.c | 8 | ||||
-rw-r--r-- | mm/page-writeback.c | 15 | ||||
-rw-r--r-- | mm/page_cgroup.c | 3 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 19 | ||||
-rw-r--r-- | mm/slub.c | 558 | ||||
-rw-r--r-- | mm/vmalloc.c | 8 | ||||
-rw-r--r-- | mm/vmscan.c | 71 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
14 files changed, 484 insertions, 453 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d6edf8d14f9c..a87da524a4a0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -359,6 +359,17 @@ static unsigned long bdi_longest_inactive(void) return max(5UL * 60 * HZ, interval); } +/* + * Clear pending bit and wakeup anybody waiting for flusher thread creation or + * shutdown + */ +static void bdi_clear_pending(struct backing_dev_info *bdi) +{ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); +} + static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -390,6 +401,13 @@ static int bdi_forker_thread(void *ptr) } spin_lock_bh(&bdi_lock); + /* + * In the following loop we are going to check whether we have + * some work to do without any synchronization with tasks + * waking us up to do work for them. So we have to set task + * state already here so that we don't miss wakeups coming + * after we verify some condition. + */ set_current_state(TASK_INTERRUPTIBLE); list_for_each_entry(bdi, &bdi_list, bdi_list) { @@ -469,11 +487,13 @@ static int bdi_forker_thread(void *ptr) spin_unlock_bh(&bdi->wb_lock); wake_up_process(task); } + bdi_clear_pending(bdi); break; case KILL_THREAD: __set_current_state(TASK_RUNNING); kthread_stop(task); + bdi_clear_pending(bdi); break; case NO_ACTION: @@ -489,16 +509,8 @@ static int bdi_forker_thread(void *ptr) else schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); - /* Back to the main loop */ - continue; + break; } - - /* - * Clear pending bit and wakeup anybody waiting to tear us down. - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); } return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 645a080ba4df..7771871fa353 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -827,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; - unsigned int nr_found; + unsigned int nr_found, nr_skip; rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, (void ***)pages, NULL, start, nr_pages); ret = 0; + nr_skip = 0; for (i = 0; i < nr_found; i++) { struct page *page; repeat: @@ -856,6 +857,7 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ + nr_skip++; continue; } @@ -876,7 +878,7 @@ repeat: * If all entries were removed before we could secure them, * try again, because callers stop trying once 0 is returned. */ - if (unlikely(!ret && nr_found)) + if (unlikely(!ret && nr_found > nr_skip)) goto restart; rcu_read_unlock(); return ret; diff --git a/mm/highmem.c b/mm/highmem.c index 693394daa2ed..5ef672c07f75 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -326,7 +326,7 @@ static struct page_address_slot { spinlock_t lock; /* Protect this bucket's list */ } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; -static struct page_address_slot *page_slot(struct page *page) +static struct page_address_slot *page_slot(const struct page *page) { return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } @@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) +void *page_address(const struct page *page) { unsigned long flags; void *ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 930de9437271..3508777837c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); -enum { - SCAN_BY_LIMIT, - SCAN_BY_SYSTEM, - NR_SCAN_CONTEXT, - SCAN_BY_SHRINK, /* not recorded now */ -}; - -enum { - SCAN, - SCAN_ANON, - SCAN_FILE, - ROTATE, - ROTATE_ANON, - ROTATE_FILE, - FREED, - FREED_ANON, - FREED_FILE, - ELAPSED, - NR_SCANSTATS, -}; - -struct scanstat { - spinlock_t lock; - unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; - unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; -}; - -const char *scanstat_string[NR_SCANSTATS] = { - "scanned_pages", - "scanned_anon_pages", - "scanned_file_pages", - "rotated_pages", - "rotated_anon_pages", - "rotated_file_pages", - "freed_pages", - "freed_anon_pages", - "freed_file_pages", - "elapsed_ns", -}; -#define SCANSTAT_WORD_LIMIT "_by_limit" -#define SCANSTAT_WORD_SYSTEM "_by_system" -#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" - - /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -313,8 +269,7 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* For recording LRU-scan statistics */ - struct scanstat scanstat; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif -static void __mem_cgroup_record_scanstat(unsigned long *stats, - struct memcg_scanrecord *rec) -{ - - stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; - stats[SCAN_ANON] += rec->nr_scanned[0]; - stats[SCAN_FILE] += rec->nr_scanned[1]; - - stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; - stats[ROTATE_ANON] += rec->nr_rotated[0]; - stats[ROTATE_FILE] += rec->nr_rotated[1]; - - stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; - stats[FREED_ANON] += rec->nr_freed[0]; - stats[FREED_FILE] += rec->nr_freed[1]; - - stats[ELAPSED] += rec->elapsed; -} - -static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) -{ - struct mem_cgroup *mem; - int context = rec->context; - - if (context >= NR_SCAN_CONTEXT) - return; - - mem = rec->mem; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); - spin_unlock(&mem->scanstat.lock); - - mem = rec->root; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); - spin_unlock(&mem->scanstat.lock); -} - /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - struct memcg_scanrecord rec; unsigned long excess; - unsigned long scanned; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; - if (shrink) - rec.context = SCAN_BY_SHRINK; - else if (check_soft) - rec.context = SCAN_BY_SYSTEM; - else - rec.context = SCAN_BY_LIMIT; - - rec.root = root_mem; - while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } - rec.mem = victim; - rec.nr_scanned[0] = 0; - rec.nr_scanned[1] = 0; - rec.nr_rotated[0] = 0; - rec.nr_rotated[1] = 0; - rec.nr_freed[0] = 0; - rec.nr_freed[1] = 0; - rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &rec, &scanned); - *total_scanned += scanned; + noswap, zone, &nr_scanned); + *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, &rec); - mem_cgroup_record_scanstat(&rec); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -1841,29 +1739,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int lock_count = -1; struct mem_cgroup *iter, *failed = NULL; bool cond = true; for_each_mem_cgroup_tree_cond(iter, mem, cond) { - bool locked = iter->oom_lock; - - iter->oom_lock = true; - if (lock_count == -1) - lock_count = iter->oom_lock; - else if (lock_count != locked) { + if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ - lock_count = 0; failed = iter; cond = false; - } + } else + iter->oom_lock = true; } if (!failed) - goto done; + return true; /* * OK, we failed to lock the whole subtree so we have to clean up @@ -1877,8 +1769,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) } iter->oom_lock = false; } -done: - return lock_count; + return false; } /* @@ -2169,13 +2060,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); - /* - * Get a hint for avoiding draining charges on the current cpu, - * which must be exhausted by our charging. It is not required that - * this be a precise check, so we use raw_smp_processor_id() instead of - * getcpu()/putcpu(). - */ - curcpu = raw_smp_processor_id(); + curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *mem; @@ -2192,6 +2077,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) schedule_work_on(cpu, &stock->work); } } + put_cpu(); if (!sync) goto out; @@ -3866,18 +3752,14 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { - struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } - rec.context = SCAN_BY_SHRINK; - rec.mem = mem; - rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, &rec); + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4721,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ -static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, - struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - char string[64]; - int i; - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); - } - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); - } - return 0; -} - -static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, - unsigned int event) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - - spin_lock(&mem->scanstat.lock); - memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); - memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); - spin_unlock(&mem->scanstat.lock); - return 0; -} - - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4839,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif - { - .name = "vmscan_stat", - .read_map = mem_cgroup_vmscan_stat_read, - .trigger = mem_cgroup_reset_vmscan_stat, - }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -5107,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); - spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8b57173c1dd5..9c51f9f58cac 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; - pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol); + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + new_pol); if (prev) { vma = prev; next = vma->vm_next; @@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { - err = copy_from_user(bm, nm, alloc_size); + unsigned long copy_size; + copy_size = min_t(unsigned long, sizeof(bm), alloc_size); + err = copy_from_user(bm, nm, copy_size); /* ensure entire bitmap is zeroed */ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); err |= compat_put_bitmap(nmask, bm, nr_bits); diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e677414..14d0a6a632f6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + /* + * Peek to check is_swap_pte() before taking ptlock? No, we + * can race mremap's move_ptes(), which skips anon_vma lock. + */ ptl = pte_lockptr(mm, pmd); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1960744f881..0e309cd1b5b9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -754,21 +754,10 @@ static void balance_dirty_pages(struct address_space *mapping, * 200ms is typically more than enough to curb heavy dirtiers; * (b) the pause time limit makes the dirtiers more responsive. */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_MAXPAUSE_AREA && + if (nr_dirty < dirty_thresh && + bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && time_after(jiffies, start_time + MAX_PAUSE)) break; - /* - * pass-good area. When some bdi gets blocked (eg. NFS server - * not responding), or write bandwidth dropped dramatically due - * to concurrent reads, or dirty threshold suddenly dropped and - * the dirty pages cannot be brought down anytime soon (eg. on - * slow USB stick), at least let go of the good bdi's. - */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_PASSGOOD_AREA && - bdi_dirty < bdi_thresh) - break; /* * Increase the delay for each loop, up to our previous diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 39d216d535ea..6bdc67dbbc28 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -513,11 +513,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); - array = vmalloc(array_size); + array = vzalloc(array_size); if (!array) goto nomem; - memset(array, 0, array_size); ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->length = length; diff --git a/mm/shmem.c b/mm/shmem.c index 32f6763f16fb..2d3577295298 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1458,7 +1458,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, - &dentry->d_name, NULL, + &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { @@ -1598,7 +1598,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { diff --git a/mm/slab.c b/mm/slab.c index 6d90a091fdca..708efe886154 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1851,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x:", offset); + printk(KERN_ERR "%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; bad_count++; } - printk(" %02x", (unsigned char)data[offset + i]); } - printk("\n"); + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); if (bad_count == 1) { error ^= POISON_FREE; @@ -3039,14 +3039,9 @@ bad: printk(KERN_ERR "slab: Internal list corruption detected in " "cache '%s'(%d), slabp %p(%d). Hexdump:\n", cachep->name, cachep->num, slabp, slabp->inuse); - for (i = 0; - i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); - i++) { - if (i % 16 == 0) - printk("\n%03x:", i); - printk(" %02x", ((unsigned char *)slabp)[i]); - } - printk("\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), + 1); BUG(); } } @@ -4584,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slub.c b/mm/slub.c index 9f662d70eb47..95215aa6a75e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -467,34 +467,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->objsize, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, + print_section("Redzone ", p + s->objsize, s->inuse - s->objsize); if (s->offset) @@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -838,7 +812,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -987,7 +961,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->objsize); dump_stack(); } @@ -1447,7 +1421,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; page->frozen = 1; out: return page; @@ -1534,7 +1508,7 @@ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); @@ -1554,10 +1528,13 @@ static inline void remove_partial(struct kmem_cache_node *n, * Lock slab, remove from the partial list and put the object into the * per cpu freelist. * + * Returns a list of objects or NULL if it fails. + * * Must hold list_lock. */ -static inline int acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode) { void *freelist; unsigned long counters; @@ -1572,7 +1549,8 @@ static inline int acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; + if (mode) + new.inuse = page->objects; VM_BUG_ON(new.frozen); new.frozen = 1; @@ -1583,32 +1561,19 @@ static inline int acquire_slab(struct kmem_cache *s, "lock and freeze")); remove_partial(n, page); - - if (freelist) { - /* Populate the per cpu freelist */ - this_cpu_write(s->cpu_slab->freelist, freelist); - this_cpu_write(s->cpu_slab->page, page); - this_cpu_write(s->cpu_slab->node, page_to_nid(page)); - return 1; - } else { - /* - * Slab page came from the wrong list. No object to allocate - * from. Put it onto the correct list and continue partial - * scan. - */ - printk(KERN_ERR "SLUB: %s : Page without available objects on" - " partial list\n", s->name); - return 0; - } + return freelist; } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, struct kmem_cache_cpu *c) { - struct page *page; + struct page *page, *page2; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1620,26 +1585,43 @@ static struct page *get_partial_node(struct kmem_cache *s, return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (acquire_slab(s, n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t = acquire_slab(s, n, page, object == NULL); + int available; + + if (!t) + break; + + if (!object) { + c->page = page; + c->node = page_to_nid(page); + stat(s, ALLOC_FROM_PARTIAL); + object = t; + available = page->objects - page->inuse; + } else { + page->freelist = t; + available = put_cpu_partial(s, page, 0); + } + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1672,10 +1654,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(s, n); - if (page) { + object = get_partial_node(s, n, c); + if (object) { put_mems_allowed(); - return page; + return object; } } } @@ -1687,16 +1669,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; + void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(s, get_node(s, searchnode)); - if (page || node != NUMA_NO_NODE) - return page; + object = get_partial_node(s, get_node(s, searchnode), c); + if (object || node != NUMA_NO_NODE) + return object; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } #ifdef CONFIG_PREEMPT @@ -1765,9 +1748,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } -/* - * Remove the cpu slab - */ /* * Remove the cpu slab @@ -1781,13 +1761,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) enum slab_modes l = M_NONE, m = M_NONE; void *freelist; void *nextfree; - int tail = 0; + int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); - tail = 1; + tail = DEACTIVATE_TO_TAIL; } c->tid = next_tid(c->tid); @@ -1893,7 +1873,7 @@ redo: if (m == M_PARTIAL) { add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail); } else if (m == M_FULL) { @@ -1920,6 +1900,123 @@ redo: } } +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct kmem_cache_node *n = NULL; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct page *page; + + while ((page = c->partial)) { + enum slab_modes { M_PARTIAL, M_FREE }; + enum slab_modes l, m; + struct page new; + struct page old; + + c->partial = page->next; + l = M_FREE; + + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && (!n || n->nr_partial > s->min_partial)) + m = M_FREE; + else { + struct kmem_cache_node *n2 = get_node(s, + page_to_nid(page)); + + m = M_PARTIAL; + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + } + + if (l != m) { + if (l == M_PARTIAL) + remove_partial(n, page); + else + add_partial(n, page, 1); + + l = m; + } + + } while (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } + } + + if (n) + spin_unlock(&n->list_lock); +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s); + local_irq_restore(flags); + pobjects = 0; + pages = 0; + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + stat(s, CPU_PARTIAL_FREE); + return pobjects; +} + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); @@ -1935,8 +2032,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); + } } static void flush_cpu_slab(void *d) @@ -2027,12 +2128,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) } } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *object; + struct kmem_cache_cpu *c; + struct page *page = new_slab(s, flags, node); + + if (page) { + c = __this_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + *pc = c; + } else + object = NULL; + + return object; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -2049,7 +2177,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *page; unsigned long flags; struct page new; unsigned long counters; @@ -2064,13 +2191,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; - - page = c->page; - if (!page) + if (!c->page) goto new_slab; - +redo: if (unlikely(!node_match(c, node))) { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); @@ -2080,8 +2203,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); do { - object = page->freelist; - counters = page->counters; + object = c->page->freelist; + counters = c->page->counters; new.counters = counters; VM_BUG_ON(!new.frozen); @@ -2093,17 +2216,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * * If there are objects left then we retrieve them * and use them to refill the per cpu queue. - */ + */ - new.inuse = page->objects; + new.inuse = c->page->objects; new.frozen = object != NULL; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, c->page, object, counters, NULL, new.counters, "__slab_alloc")); - if (unlikely(!object)) { + if (!object) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2112,58 +2235,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: - VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); return object; new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - stat(s, ALLOC_FROM_PARTIAL); - object = c->freelist; - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + if (c->partial) { + c->page = c->partial; + c->partial = c->page->next; + c->node = page_to_nid(c->page); + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; } - page = new_slab(s, gfpflags, node); + /* Then do expensive stuff like retrieving pages from the partial lists */ + object = get_partial(s, gfpflags, node, c); - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); + if (unlikely(!object)) { - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - object = page->freelist; - page->freelist = NULL; - page->inuse = page->objects; + object = new_slab_objects(s, gfpflags, node, &c); - stat(s, ALLOC_SLAB); - c->node = page_to_nid(page); - c->page = page; + if (unlikely(!object)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + local_irq_restore(flags); + return NULL; + } } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; -debug: - if (!object || !alloc_debug_processing(s, page, object, addr)) - goto new_slab; + if (likely(!kmem_cache_debug(s))) + goto load_freelist; + + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, c->page, object, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ c->freelist = get_freepointer(s, object); deactivate_slab(s, c); - c->page = NULL; c->node = NUMA_NO_NODE; local_irq_restore(flags); return object; @@ -2333,16 +2445,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page, was_frozen = new.frozen; new.inuse--; if ((!new.inuse || !prior) && !was_frozen && !n) { - n = get_node(s, page_to_nid(page)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + + if (!kmem_cache_debug(s) && !prior) + + /* + * Slab was on no list before and will be partially empty + * We can defer the list move and instead freeze it. + */ + new.frozen = 1; + + else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } } inuse = new.inuse; @@ -2352,7 +2477,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page, "__slab_free")); if (likely(!n)) { - /* + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) + put_cpu_partial(s, page, 1); + + /* * The list lock was not taken therefore no list * activity can be necessary. */ @@ -2377,7 +2510,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2421,7 +2554,6 @@ static __always_inline void slab_free(struct kmem_cache *s, slab_free_hook(s, x); redo: - /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since @@ -2685,7 +2817,7 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG @@ -2695,7 +2827,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2911,7 +3043,34 @@ static int kmem_cache_open(struct kmem_cache *s, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch 50% + * to keep some capacity around for frees. + */ + if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -2970,13 +3129,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); @@ -2986,7 +3145,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -3020,6 +3178,7 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -3028,8 +3187,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3347,23 +3506,23 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse) { - remove_partial(n, page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); @@ -4319,6 +4478,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct page *page; if (!c || c->node < 0) continue; @@ -4334,6 +4494,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[c->node] += x; } + page = c->partial; + + if (page) { + x = page->pobjects; + total += x; + nodes[c->node] += x; + } per_cpu[c->node]++; } } @@ -4412,11 +4579,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -4485,6 +4653,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = strict_strtoul(buf, 10, &objects); + if (err) + return err; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) @@ -4523,6 +4712,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4845,6 +5065,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); #endif static struct attribute *slab_attrs[] = { @@ -4853,6 +5075,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4865,6 +5088,7 @@ static struct attribute *slab_attrs[] = { &destroy_by_rcu_attr.attr, &shrink_attr.attr, &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4906,6 +5130,8 @@ static struct attribute *slab_attrs[] = { &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -5257,7 +5483,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ef0903058ee..5016f19e1661 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2140,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size) return NULL; } + /* + * If the allocated address space is passed to a hypercall + * before being used then we cannot rely on a page fault to + * trigger an update of the page tables. So sync all the page + * tables here. + */ + vmalloc_sync_all(); + return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ef69124fa3e..9fdfce7ba403 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -105,7 +105,6 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; - struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; - if (!scanning_global_lru(sc)) { - sc->memcg_record->nr_scanned[0] += *nr_anon; - sc->memcg_record->nr_scanned[1] += *nr_file; - } } /* @@ -1423,7 +1416,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; - /* If we have relaimed everything on the isolated list, no stall */ + /* If we have reclaimed everything on the isolated list, no stall */ if (nr_freed == nr_taken) return false; @@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_freed[file] += nr_reclaimed; - local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. */ reclaim_stat->recent_rotated[file] += nr_rotated; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -1808,23 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; - int force_scan = 0; + bool force_scan = false; unsigned long nr_force_scan[2]; - - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - - if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { - /* kswapd does zone balancing and need to scan this zone */ - if (scanning_global_lru(sc) && current_is_kswapd()) - force_scan = 1; - /* memcg may have small limit and need to avoid priority drop */ - if (!scanning_global_lru(sc)) - force_scan = 1; - } + /* kswapd does zone balancing and needs to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = true; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1837,6 +1815,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, @@ -2268,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - struct memcg_scanrecord *rec, - unsigned long *scanned) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { .nr_scanned = 0, @@ -2281,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_swap = !noswap, .order = 0, .mem_cgroup = mem, - .memcg_record = rec, }; - unsigned long start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2292,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = sched_clock(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2301,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); - end = sched_clock(); - - if (rec) - rec->elapsed += end - start; - *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap, - struct memcg_scanrecord *rec) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; - unsigned long start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2328,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, .mem_cgroup = mem_cont, - .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2337,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = sched_clock(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2352,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = sched_clock(); - if (rec) - rec->elapsed += end - start; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2529,6 +2497,9 @@ loop_again: high_wmark_pages(zone), 0, 0)) { end_zone = i; break; + } else { + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); } } if (i < 0) diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..d52b13d28e8f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", #else @@ -788,7 +788,7 @@ const char * const vmstat_text[] = { #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; -#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ #ifdef CONFIG_PROC_FS |