| author | Adrian Bunk <bunk@r063144.stusta.swh.mhn.de> | 2006-03-20 20:30:36 +0300 | 
|---|---|---|
| committer | Adrian Bunk <bunk@r063144.stusta.swh.mhn.de> | 2006-03-20 20:30:36 +0300 | 
| commit | 0f76ee451484d02c7405d92e7bceb39b415abb01 (patch) | |
| tree | 9722f84281f786ba48971dde057f5171a49969e4 /mm/slab.c | |
| parent | 01d206a7c1167639f6ca6dac22140fbdca017558 (diff) | |
| parent | 7705a8792b0fc82fd7d4dd923724606bbfd9fb20 (diff) | |
| download | linux-0f76ee451484d02c7405d92e7bceb39b415abb01.tar.xz | |
Merge with git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
Diffstat (limited to 'mm/slab.c')
| -rw-r--r-- | mm/slab.c | 306 | 
1 file changed, 212 insertions, 94 deletions
diff --git a/mm/slab.c b/mm/slab.c
index 71370256a7eb..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -294,6 +294,7 @@ struct kmem_list3 {
 	unsigned long next_reap;
 	int free_touched;
 	unsigned int free_limit;
+	unsigned int colour_next;	/* Per-node cache coloring */
 	spinlock_t list_lock;
 	struct array_cache *shared;	/* shared per node */
 	struct array_cache **alien;	/* on other nodes */
@@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 	INIT_LIST_HEAD(&parent->slabs_free);
 	parent->shared = NULL;
 	parent->alien = NULL;
+	parent->colour_next = 0;
 	spin_lock_init(&parent->list_lock);
 	parent->free_objects = 0;
 	parent->free_touched = 0;
@@ -390,7 +392,6 @@ struct kmem_cache {
 	size_t colour;		/* cache colouring range */
 	unsigned int colour_off;	/* colour offset */
-	unsigned int colour_next;	/* cache colouring */
 	struct kmem_cache *slabp_cache;
 	unsigned int slab_size;
 	unsigned int dflags;	/* dynamic flags */
@@ -788,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -805,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
@@ -883,14 +926,31 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
 	struct array_cache *ac;
 	unsigned long flags;
 
 	for_each_online_node(i) {
-		ac = l3->alien[i];
+		ac = alien[i];
 		if (ac) {
 			spin_lock_irqsave(&ac->lock, flags);
 			__drain_alien_cache(cachep, ac, i);
@@ -899,9 +959,19 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
 	}
 }
 #else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+	return (struct array_cache **) 0x01020304ul;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -935,6 +1005,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
 				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+				/*
+				 * The l3s don't come and go as CPUs come and
+				 * go.  cache_chain_mutex is sufficient
+				 * protection here.
+				 */
 				cachep->nodelists[node] = l3;
 			}
@@ -949,26 +1024,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		   & array cache's */
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 
 			nc = alloc_arraycache(node, cachep->limit,
-					      cachep->batchcount);
+						cachep->batchcount);
 			if (!nc)
 				goto bad;
+			shared = alloc_arraycache(node,
+					cachep->shared * cachep->batchcount,
+					0xbaadf00d);
+			if (!shared)
+				goto bad;
+
+			alien = alloc_alien_cache(node, cachep->limit);
+			if (!alien)
+				goto bad;
 			cachep->array[cpu] = nc;
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
-			if (!l3->shared) {
-				if (!(nc = alloc_arraycache(node,
-							    cachep->shared *
-							    cachep->batchcount,
-							    0xbaadf00d)))
-					goto bad;
-				/* we are serialised from CPU_DEAD or
-				   CPU_UP_CANCELLED by the cpucontrol lock */
-				l3->shared = nc;
+			spin_lock_irq(&l3->list_lock);
+			if (!l3->shared) {
+				/*
+				 * We are serialised from CPU_DEAD or
+				 * CPU_UP_CANCELLED by the cpucontrol lock
+				 */
+				l3->shared = shared;
+				shared = NULL;
 			}
+#ifdef CONFIG_NUMA
+			if (!l3->alien) {
+				l3->alien = alien;
+				alien = NULL;
+			}
+#endif
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			free_alien_cache(alien);
 		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
@@ -977,25 +1072,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+		/*
+		 * Even if all the cpus of a node are down, we don't free the
+		 * kmem_list3 of any cache. This to avoid a race between
+		 * cpu_down, and a kmalloc allocation from another cpu for
+		 * memory from the node of the cpu going down.  The list3
+		 * structure is usually allocated from kmem_cache_create() and
+		 * gets destroyed at kmem_cache_destroy().
+		 */
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 			cpumask_t mask;
 
 			mask = node_to_cpumask(node);
-			spin_lock_irq(&cachep->spinlock);
 			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
 			l3 = cachep->nodelists[node];
 			if (!l3)
-				goto unlock_cache;
+				goto free_array_cache;
 
-			spin_lock(&l3->list_lock);
+			spin_lock_irq(&l3->list_lock);
 
 			/* Free limit for this kmem_list3 */
 			l3->free_limit -= cachep->batchcount;
@@ -1003,34 +1107,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				free_block(cachep, nc->entry, nc->avail, node);
 			if (!cpus_empty(mask)) {
-				spin_unlock(&l3->list_lock);
-				goto unlock_cache;
+				spin_unlock_irq(&l3->list_lock);
+				goto free_array_cache;
 			}
-			if (l3->shared) {
+			shared = l3->shared;
+			if (shared) {
 				free_block(cachep, l3->shared->entry,
 					   l3->shared->avail, node);
-				kfree(l3->shared);
 				l3->shared = NULL;
 			}
-			if (l3->alien) {
-				drain_alien_cache(cachep, l3);
-				free_alien_cache(l3->alien);
-				l3->alien = NULL;
-			}
-			/* free slabs belonging to this node */
-			if (__node_shrink(cachep, node)) {
-				cachep->nodelists[node] = NULL;
-				spin_unlock(&l3->list_lock);
-				kfree(l3);
-			} else {
-				spin_unlock(&l3->list_lock);
+			alien = l3->alien;
+			l3->alien = NULL;
+
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			if (alien) {
+				drain_alien_cache(cachep, alien);
+				free_alien_cache(alien);
 			}
-		      unlock_cache:
-			spin_unlock_irq(&cachep->spinlock);
+free_array_cache:
 			kfree(nc);
 		}
+		/*
+		 * In the previous loop, all the objects were freed to
+		 * the respective cache's slabs,  now we can go ahead and
+		 * shrink each nodelist to its limit.
+		 */
+		list_for_each_entry(cachep, &cache_chain, next) {
+			l3 = cachep->nodelists[node];
+			if (!l3)
+				continue;
+			spin_lock_irq(&l3->list_lock);
+			/* free slabs belonging to this node */
+			__node_shrink(cachep, node);
+			spin_unlock_irq(&l3->list_lock);
+		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
@@ -1070,6 +1184,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1113,13 +1228,16 @@ void __init kmem_cache_init(void)
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-		       &left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
-	cache_cache.colour_next = 0;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
@@ -1575,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
 			break;
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
+		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
			break;
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
@@ -1664,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1810,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1865,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1925,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
       oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2011,18 +2131,16 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	smp_call_function_all_cpus(do_drain, cachep);
 	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
-			spin_lock(&l3->list_lock);
+			spin_lock_irq(&l3->list_lock);
 			drain_array_locked(cachep, l3->shared, 1, node);
-			spin_unlock(&l3->list_lock);
+			spin_unlock_irq(&l3->list_lock);
 			if (l3->alien)
-				drain_alien_cache(cachep, l3);
+				drain_alien_cache(cachep, l3->alien);
 		}
 	}
-	spin_unlock_irq(&cachep->spinlock);
 }
 
 static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -2324,20 +2442,20 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		 */
 		ctor_flags |= SLAB_CTOR_ATOMIC;
 
-	/* About to mess with non-constant members - lock. */
+	/* Take the l3 list lock to change the colour_next on this node */
 	check_irq_off();
-	spin_lock(&cachep->spinlock);
+	l3 = cachep->nodelists[nodeid];
+	spin_lock(&l3->list_lock);
 
 	/* Get colour for the slab, and cal the next value. */
-	offset = cachep->colour_next;
-	cachep->colour_next++;
-	if (cachep->colour_next >= cachep->colour)
-		cachep->colour_next = 0;
-	offset *= cachep->colour_off;
+	offset = l3->colour_next;
+	l3->colour_next++;
+	if (l3->colour_next >= cachep->colour)
+		l3->colour_next = 0;
+	spin_unlock(&l3->list_lock);
 
-	spin_unlock(&cachep->spinlock);
+	offset *= cachep->colour_off;
 
-	check_irq_off();
 	if (local_flags & __GFP_WAIT)
 		local_irq_enable();
@@ -2367,7 +2485,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
-	l3 = cachep->nodelists[nodeid];
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
@@ -2500,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		       cachep->name, cachep->num, slabp, slabp->inuse);
 		for (i = 0;
-		     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 		     i++) {
 			if ((i % 16) == 0)
 				printk("\n%03x:", i);
@@ -2725,6 +2842,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	BUG_ON(!l3);
 
       retry:
+	check_irq_off();
 	spin_lock(&l3->list_lock);
 	entry = l3->slabs_partial.next;
 	if (entry == &l3->slabs_partial) {
@@ -3304,11 +3422,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
 
 	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
+	spin_lock(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
 	cachep->shared = shared;
-	spin_unlock_irq(&cachep->spinlock);
+	spin_unlock(&cachep->spinlock);
 
 	for_each_online_cpu(i) {
 		struct array_cache *ccold = new.new[i];
@@ -3439,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3490,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
@@ -3564,8 +3681,7 @@ static int s_show(struct seq_file *m, void *p)
 	int node;
 	struct kmem_list3 *l3;
 
-	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
+	spin_lock(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
 	for_each_online_node(node) {
@@ -3573,7 +3689,8 @@ static int s_show(struct seq_file *m, void *p)
 		if (!l3)
 			continue;
 
-		spin_lock(&l3->list_lock);
+		check_irq_on();
+		spin_lock_irq(&l3->list_lock);
 
 		list_for_each(q, &l3->slabs_full) {
 			slabp = list_entry(q, struct slab, list);
@@ -3598,9 +3715,10 @@ static int s_show(struct seq_file *m, void *p)
 			num_slabs++;
 		}
 		free_objects += l3->free_objects;
-		shared_avail += l3->shared->avail;
+		if (l3->shared)
+			shared_avail += l3->shared->avail;
 
-		spin_unlock(&l3->list_lock);
+		spin_unlock_irq(&l3->list_lock);
 	}
 	num_slabs += active_slabs;
 	num_objs = num_slabs * cachep->num;
@@ -3644,7 +3762,7 @@ static int s_show(struct seq_file *m, void *p)
 	}
 #endif
 	seq_putc(m, '\n');
-	spin_unlock_irq(&cachep->spinlock);
+	spin_unlock(&cachep->spinlock);
 	return 0;
 }
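To see the round-robin reaping idea from the cache_reap() changes above in isolation: each CPU keeps a cursor (reap_node) that init_reap_node() seeds and next_reap_node() advances across the online nodes, so each reap pass drains the alien cache of at most one remote node instead of all of them. The following is a minimal userspace sketch of that cursor logic only; the node bitmap, MAX_NUMNODES value and the next_node()/first_node() helpers are simplified stand-ins, not the kernel implementations.

```c
/*
 * Userspace sketch of the per-CPU round-robin reap-node cursor added by
 * this patch.  The bitmap and helpers below are stand-ins for the kernel's
 * node_online_map, next_node() and first_node(); they are not kernel code.
 */
#include <stdio.h>

#define MAX_NUMNODES 8

static unsigned long node_online_map = 0x0b;	/* nodes 0, 1 and 3 online */
static int reap_node;				/* per-CPU in the kernel */

/* Return the next online node after 'node', or MAX_NUMNODES if none. */
static int next_node(int node, unsigned long map)
{
	for (int n = node + 1; n < MAX_NUMNODES; n++)
		if (map & (1UL << n))
			return n;
	return MAX_NUMNODES;
}

static int first_node(unsigned long map)
{
	return next_node(-1, map);
}

/* Mirrors next_reap_node(): advance, wrapping to the first online node. */
static void advance_reap_node(void)
{
	reap_node = next_node(reap_node, node_online_map);
	if (reap_node >= MAX_NUMNODES)
		reap_node = first_node(node_online_map);
}

int main(void)
{
	reap_node = first_node(node_online_map);	/* init_reap_node() analogue */
	for (int pass = 0; pass < 6; pass++) {
		printf("reap pass %d drains the alien cache of node %d\n",
		       pass, reap_node);
		advance_reap_node();
	}
	return 0;	/* visits nodes 0, 1, 3, 0, 1, 3 */
}
```

The point of the round robin is that the expensive remote draining is spread across successive timer ticks rather than hitting every remote node on every tick.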
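The colouring part of the patch moves colour_next from struct kmem_cache into the per-node kmem_list3, so each node cycles through the colour range under its own list_lock instead of the cache-wide spinlock. Below is a small sketch of the offset arithmetic cache_grow() performs; the struct and field names are simplified stand-ins for the kernel structures, chosen only to illustrate the calculation.

```c
/*
 * Sketch of per-node slab colouring: each node has its own colour_next,
 * so slabs grown on different nodes start at independently cycling
 * offsets.  These structs are illustrative stand-ins, not mm/slab.c.
 */
#include <stdio.h>

struct node_list {			/* stands in for struct kmem_list3 */
	unsigned int colour_next;
};

struct cache {				/* stands in for struct kmem_cache */
	unsigned int colour;		/* number of distinct colours */
	unsigned int colour_off;	/* bytes per colour step */
	struct node_list nodes[2];
};

/* Colour offset for the next slab grown on 'nodeid' (cf. cache_grow()). */
static unsigned int next_colour_offset(struct cache *c, int nodeid)
{
	struct node_list *l3 = &c->nodes[nodeid];
	unsigned int offset = l3->colour_next;

	l3->colour_next++;
	if (l3->colour_next >= c->colour)
		l3->colour_next = 0;
	return offset * c->colour_off;
}

int main(void)
{
	/* e.g. 192 bytes left over per slab, 64-byte lines -> 3 colours */
	struct cache c = { .colour = 3, .colour_off = 64 };

	for (int i = 0; i < 4; i++)
		printf("node 0, slab %d: colour offset %u\n",
		       i, next_colour_offset(&c, 0));
	printf("node 1, slab 0: colour offset %u\n", next_colour_offset(&c, 1));
	return 0;	/* node 0 cycles 0, 64, 128, 0; node 1 starts at 0 again */
}
```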
