From b55e6bfcd23cb2f7249095050c649f7aea813f9f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:06 -0800 Subject: [PATCH] md: Split disks array out of raid5 conf structure so it is easier to grow The remainder of this batch implements raid5 reshaping. Currently the only shape change that is supported is adding a device, but it is envisioned that changing the chunksize and layout will also be supported, as well as changing the level (e.g. 1->5, 5->6). The reshape process naturally has to move all of the data in the array, and so should be used with caution. It is believed to work, and some testing does support this, but wider testing would be great for increasing my confidence. You will need a version of mdadm newer than 2.3.1 to make use of raid5 growth. This is because mdadm needs to take a copy of a 'critical section' at the start of the array in case there is a crash at an awkward moment. On restart, mdadm will restore the critical section and allow reshape to continue. I hope to release a 2.4-pre by early next week - it still needs a little more polishing. This patch: Previously the array of disk information was included in the raid5 'conf' structure which was allocated to an appropriate size. This makes it awkward to change the size of that array. So we split it off into a separate kmalloced array which will require a little extra indexing, but is much easier to grow. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 394da8207b34..94dbdd406f12 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -240,7 +240,7 @@ struct raid5_private_data { * waiting for 25% to be free */ spinlock_t device_lock; - struct disk_info disks[0]; + struct disk_info *disks; }; typedef struct raid5_private_data raid5_conf_t; -- cgit v1.2.3 From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:07 -0800 Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding an array Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at its new size. It isn't worth the effort to shrink it back again. Unfortunately this means we need two kmem_cache names as, for a short period of time, we have two kmem_caches: raid5/%s and raid5/%s-alt.
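The pattern this patch introduces - build a complete second pool under the alternate name, hand the existing page pointers across, then retire the old pool - is easiest to see in isolation. Below is a minimal userspace sketch of that two-pool migration, with plain malloc/free standing in for kmem_cache_create/kmem_cache_alloc; the names mini_stripe and pool_grow are illustrative only, not from the patch.

#include <stdio.h>
#include <stdlib.h>

struct mini_stripe {
    int nslots;
    void *page[];              /* one cached page per device slot */
};

static struct mini_stripe *stripe_alloc(int nslots)
{
    struct mini_stripe *s = calloc(1, sizeof(*s) + nslots * sizeof(void *));
    if (s)
        s->nslots = nslots;
    return s;
}

/* Grow every stripe in 'pool' from oldsize to newsize slots: allocate a
 * replacement, hand the existing page pointers across (no copying of
 * page contents), give the new slots fresh pages, free the old stripe. */
static int pool_grow(struct mini_stripe **pool, int count,
                     int oldsize, int newsize)
{
    for (int n = 0; n < count; n++) {
        struct mini_stripe *nsh = stripe_alloc(newsize);
        if (!nsh)
            return -1;
        for (int i = 0; i < oldsize; i++)
            nsh->page[i] = pool[n]->page[i];
        for (int i = oldsize; i < newsize; i++)
            nsh->page[i] = malloc(4096);
        free(pool[n]);
        pool[n] = nsh;
    }
    return 0;
}

int main(void)
{
    enum { COUNT = 4 };
    struct mini_stripe *pool[COUNT];

    for (int n = 0; n < COUNT; n++) {
        pool[n] = stripe_alloc(3);
        for (int i = 0; i < 3; i++)
            pool[n]->page[i] = malloc(4096);
    }
    if (pool_grow(pool, COUNT, 3, 4) == 0)
        printf("grew %d stripes from 3 to 4 slots each\n", COUNT);
    return 0;
}

Handing pointers across rather than copying page contents is what lets the kernel version migrate an arbitrary amount of cached data without touching it.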
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- drivers/md/raid5.c | 131 +++++++++++++++++++++++++++++++++++++++++++-- drivers/md/raid6main.c | 4 +- include/linux/raid/raid5.h | 9 +++- 4 files changed, 137 insertions(+), 9 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/drivers/md/md.c b/drivers/md/md.c index a3ecaf8ed30a..c7b7656f9aa5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head candidates; struct list_head *tmp; mdk_rdev_t *rdev0, *rdev; mddev_t *mddev; @@ -2784,6 +2783,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { dev_t dev; + LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, mdk_rdev_t, same_set); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 03f31379cebb..6c20b44509d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); - - sc = kmem_cache_create(conf->cache_name, + sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev)); + conf->active_name = 0; + sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) return 1; conf->slab_cache = sc; + conf->pool_size = devs; while (num--) { if (!grow_one_stripe(conf)) return 1; } return 0; } +static int resize_stripes(raid5_conf_t *conf, int newsize) +{ + /* Make all the stripes able to hold 'newsize' devices. + * New slots in each stripe get 'page' set to a new page. + * + * This happens in stages: + * 1/ create a new kmem_cache and allocate the required number of + * stripe_heads. + * 2/ gather all the old stripe_heads and transfer the pages across + * to the new stripe_heads. This will have the side effect of + * freezing the array as once all stripe_heads have been collected, + * no IO will be possible. Old stripe heads are freed once their + * pages have been transferred over, and the old kmem_cache is + * freed when all stripes are done. + * 3/ reallocate conf->disks to be suitably bigger. If this fails, + * we simply return a failure status - no need to clean anything up. + * 4/ allocate new pages for the new slots in the new stripe_heads. + * If this fails, we don't bother trying to shrink the + * stripe_heads down again, we just leave them as they are. + * As each stripe_head is processed the new one is released into + * active service. + * + * Once step2 is started, we cannot afford to wait for a write, + * so we use GFP_NOIO allocations.
+ */ + struct stripe_head *osh, *nsh; + LIST_HEAD(newstripes); + struct disk_info *ndisks; + int err = 0; + kmem_cache_t *sc; + int i; + + if (newsize <= conf->pool_size) + return 0; /* never bother to shrink */ + + /* Step 1 */ + sc = kmem_cache_create(conf->cache_name[1-conf->active_name], + sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), + 0, 0, NULL, NULL); + if (!sc) + return -ENOMEM; + + for (i = conf->max_nr_stripes; i; i--) { + nsh = kmem_cache_alloc(sc, GFP_KERNEL); + if (!nsh) + break; + + memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); + + nsh->raid_conf = conf; + spin_lock_init(&nsh->lock); + + list_add(&nsh->lru, &newstripes); + } + if (i) { + /* didn't get enough, give up */ + while (!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del(&nsh->lru); + kmem_cache_free(sc, nsh); + } + kmem_cache_destroy(sc); + return -ENOMEM; + } + /* Step 2 - Must use GFP_NOIO now. + * OK, we have enough stripes, start collecting inactive + * stripes and copying them over + */ + list_for_each_entry(nsh, &newstripes, lru) { + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock, + unplug_slaves(conf->mddev); + ); + osh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + atomic_set(&nsh->count, 1); + for(i=0; i<conf->pool_size; i++) + nsh->dev[i].page = osh->dev[i].page; + for( ; i<newsize; i++) + nsh->dev[i].page = NULL; + kmem_cache_free(conf->slab_cache, osh); + } + kmem_cache_destroy(conf->slab_cache); + + /* Step 3. + * At this point, we are holding all the stripes so the array + * is completely stalled, so now is a good time to resize + * conf->disks. + */ + ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); + if (ndisks) { + for (i=0; i<conf->raid_disks; i++) + ndisks[i] = conf->disks[i]; + kfree(conf->disks); + conf->disks = ndisks; + } else + err = -ENOMEM; + + /* Step 4, return new stripes to service */ + while(!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del_init(&nsh->lru); + for (i=conf->raid_disks; i < newsize; i++) + if (nsh->dev[i].page == NULL) { + struct page *p = alloc_page(GFP_NOIO); + nsh->dev[i].page = p; + if (!p) + err = -ENOMEM; + } + release_stripe(nsh); + } + /* critical section passed, GFP_NOIO no longer needed */ + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + conf->pool_size = newsize; + return err; +} + static int drop_one_stripe(raid5_conf_t *conf) { @@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf) return 0; if (atomic_read(&sh->count)) BUG(); - shrink_buffers(sh, conf->raid_disks); + shrink_buffers(sh, conf->pool_size); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index c7632f6cc487..6df4930fddec 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev)); - sc = kmem_cache_create(conf->cache_name, + sc = kmem_cache_create(conf->cache_name[0], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 94dbdd406f12..b7b2653af7bb 100644 --- a/include/linux/raid/raid5.h +++
b/include/linux/raid/raid5.h @@ -216,7 +216,11 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ - char cache_name[20]; + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; kmem_cache_t *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -238,7 +242,8 @@ struct raid5_private_data { wait_queue_head_t wait_for_overlap; int inactive_blocked; /* release of inactive stripes blocked, * waiting for 25% to be free - */ + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; }; -- cgit v1.2.3 From 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:08 -0800 Subject: [PATCH] md: Infrastructure to allow normal IO to continue while array is expanding We need to allow different stripes to have different effective sizes, and to use the appropriate size for each. Also, when a stripe is being expanded, we must block any IO attempts until the stripe is stable again. Key elements in this change are: - each stripe_head gets a 'disks' field which is part of the key, thus there can sometimes be two stripe heads for the same area of the array, but covering different numbers of devices. One of these will be marked STRIPE_EXPANDING and so won't accept new requests. - conf->expand_progress tracks how the expansion is progressing and is used to determine whether the target part of the array has been expanded yet or not. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 88 ++++++++++++++++++++++++++++++---------------- include/linux/raid/raid5.h | 6 ++++ 2 files changed, 64 insertions(+), 30 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6c20b44509d8..7a6df515b008 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -178,10 +178,10 @@ static int grow_buffers(struct stripe_head *sh, int num) static void raid5_build_block (struct stripe_head *sh, int i); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int i; if (atomic_read(&sh->count) != 0) BUG(); @@ -198,7 +198,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) sh->pd_idx = pd_idx; sh->state = 0; - for (i=disks; i--; ) { + sh->disks = disks; + + for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->toread || dev->towrite || dev->written || @@ -215,7 +217,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) { struct stripe_head *sh; struct hlist_node *hn; @@ -223,7 +225,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector) + if (sh->sector == sector && sh->disks == disks) return sh; PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); return
NULL; @@ -232,8 +234,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(request_queue_t *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, - int pd_idx, int noblock) +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, + int pd_idx, int noblock) { struct stripe_head *sh; @@ -245,7 +247,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector); + sh = __find_stripe(conf, sector, disks); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -263,7 +265,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx); + init_stripe(sh, sector, pd_idx, disks); } else { if (atomic_read(&sh->count)) { if (!list_empty(&sh->lru)) @@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->disks = conf->raid_disks; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -483,7 +486,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -581,7 +584,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int disks = sh->disks, i; unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -735,7 +738,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, static sector_t compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; + int raid_disks = sh->disks, data_disks = raid_disks - 1; sector_t new_sector = sh->sector, check; int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; @@ -836,8 +839,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_block(struct stripe_head *sh, int dd_idx) { - raid5_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *p; PRINTK("compute_block, stripe %llu, idx %d\n", @@ -867,7 +869,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx) static void compute_parity(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + int i, pd_idx = sh->pd_idx, disks = sh->disks, count; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; @@ -1055,7 +1057,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void handle_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; + int disks = sh->disks; struct bio *return_bi= NULL; struct bio *bi; int i; @@ -1649,12 +1651,10 @@ static inline void raid5_plug_device(raid5_conf_t *conf) spin_unlock_irq(&conf->device_lock); } -static int make_request (request_queue_t *q, struct bio * 
bi) +static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; sector_t new_sector; sector_t logical_sector, last_sector; @@ -1678,20 +1678,48 @@ static int make_request (request_queue_t *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); + int disks; - new_sector = raid5_compute_sector(logical_sector, - raid_disks, data_disks, &dd_idx, &pd_idx, conf); - + retry: + if (likely(conf->expand_progress == MaxSector)) + disks = conf->raid_disks; + else { + spin_lock_irq(&conf->device_lock); + disks = conf->raid_disks; + if (logical_sector >= conf->expand_progress) + disks = conf->previous_raid_disks; + spin_unlock_irq(&conf->device_lock); + } + new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, + &dd_idx, &pd_idx, conf); PRINTK("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); - retry: prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); if (sh) { - if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { - /* Add failed due to overlap. Flush everything + if (unlikely(conf->expand_progress != MaxSector)) { + /* expansion might have moved on while waiting for a + * stripe, so we must do the range check again. + */ + int must_retry = 0; + spin_lock_irq(&conf->device_lock); + if (logical_sector < conf->expand_progress && + disks == conf->previous_raid_disks) + /* mismatch, need to try again */ + must_retry = 1; + spin_unlock_irq(&conf->device_lock); + if (must_retry) { + release_stripe(sh); + goto retry; + } + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { + /* Stripe is busy expanding or + * add failed due to overlap.
Flush everything * and wait a while */ raid5_unplug_device(mddev->queue); @@ -1703,7 +1731,6 @@ static int make_request (request_queue_t *q, struct bio * bi) raid5_plug_device(conf); handle_stripe(sh); release_stripe(sh); - } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1779,9 +1806,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); - sh = get_active_stripe(conf, sector_nr, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -1998,6 +2025,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; + conf->expand_progress = MaxSector; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2128,7 +2156,7 @@ static void print_sh (struct stripe_head *sh) printk("sh %llu, count %d.\n", (unsigned long long)sh->sector, atomic_read(&sh->count)); printk("sh %llu, ", (unsigned long long)sh->sector); - for (i = 0; i < sh->raid_conf->raid_disks; i++) { + for (i = 0; i < sh->disks; i++) { printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags); } diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b7b2653af7bb..6fa274aea2a0 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -135,6 +135,7 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ struct r5dev { struct bio req; struct bio_vec vec; @@ -174,6 +175,7 @@ struct stripe_head { #define STRIPE_DELAYED 6 #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 /* * Plugging: @@ -211,6 +213,10 @@ struct raid5_private_data { int raid_disks, working_disks, failed_disks; int max_nr_stripes; + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + int previous_raid_disks; + struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ -- cgit v1.2.3 From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:09 -0800 Subject: [PATCH] md: Core of raid5 resize process This patch provides the core of the resize/expand process. sync_request notices if a 'reshape' is happening and acts accordingly. It allocates new stripe_heads for the next chunk-wide-stripe in the target geometry, marking them STRIPE_EXPANDING. Then it finds which stripe heads in the old geometry can provide data needed by these and marks them STRIPE_EXPAND_SOURCE. This causes handle_stripe to read all blocks on those stripes. Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marked STRIPE_EXPAND_READY and then is written out and released.
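Why a reshape has to relocate every block, rather than simply append the new device, comes down to the address arithmetic: the placement of a logical block is a function of the disk count. The following standalone sketch, written in the style of raid5_compute_sector but reduced to whole chunks and a single left-asymmetric-like layout (a simplification, not the driver's exact code), shows the same logical chunks landing on different rows and devices once a 4-disk array becomes a 5-disk array.

#include <stdio.h>

struct placement { long row; int dev; int pd; };

static struct placement map_chunk(long lchunk, int ndisks)
{
    int data_disks = ndisks - 1;
    struct placement p;
    int dd;

    p.row = lchunk / data_disks;                /* stripe row on each device */
    dd    = lchunk % data_disks;                /* data slot within the row */
    p.pd  = data_disks - (int)(p.row % ndisks); /* parity rotates per row */
    p.dev = dd < p.pd ? dd : dd + 1;            /* data skips the parity slot */
    return p;
}

int main(void)
{
    for (long c = 0; c < 6; c++) {
        struct placement a = map_chunk(c, 4), b = map_chunk(c, 5);
        printf("chunk %ld: 4 disks -> row %ld dev %d (parity %d); "
               "5 disks -> row %ld dev %d (parity %d)\n",
               c, a.row, a.dev, a.pd, b.row, b.dev, b.pd);
    }
    return 0;
}

Every destination stripe therefore draws its data from a different set of source stripes, which is exactly the EXPAND_SOURCE -> EXPANDING copy flow the commit message describes.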
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++- drivers/md/raid5.c | 185 +++++++++++++++++++++++++++++++++++++++------ include/linux/raid/md_k.h | 4 + include/linux/raid/raid5.h | 4 +- 4 files changed, 181 insertions(+), 26 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/drivers/md/md.c b/drivers/md/md.c index c7b7656f9aa5..8e65986bc63f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2165,7 +2165,9 @@ action_show(mddev_t *mddev, char *page) char *type = "idle"; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) @@ -4088,8 +4090,10 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "] "); } seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery"), + "resync" : "recovery")), per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -4543,7 +4547,9 @@ static void md_do_sync(mddev_t *mddev) */ max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; - } else + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->size << 1; + else /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; @@ -4679,6 +4685,8 @@ static void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2 && mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7a6df515b008..56cba8d3e398 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); } - list_add_tail(&sh->lru, &conf->inactive_list); atomic_dec(&conf->active_stripes); - if (!conf->inactive_blocked || - atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + } } } } @@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru)) - BUG(); - list_del_init(&sh->lru); + if (!list_empty(&sh->lru)) + list_del_init(&sh->lru); } } } while (sh == NULL); @@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +{ + int sectors_per_chunk = conf->chunk_size >> 9; + sector_t x = stripe; + int pd_idx, dd_idx; + int chunk_offset = sector_div(x, sectors_per_chunk); + stripe = x; + raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk + + 
chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); + return pd_idx; +} + /* * handle_stripe - do things to a stripe. @@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh) struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num=0; @@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_DELAYED, &sh->state); syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh) * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (syncing && (uptodate < disks))) { + if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || + expanding || (failed && (sh->dev[failed_num].toread || (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) ) @@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } } + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + compute_parity(sh, RECONSTRUCT_WRITE); + for (i= conf->raid_disks; i--;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (expanding && locked == 0) { + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i=0; i< sh->disks; i++) + if (i != sh->pd_idx) { + int dd_idx, pd_idx, j; + struct stripe_head *sh2; + + sector_t bn = compute_blocknr(sh, i); + sector_t s = raid5_compute_sector(bn, conf->raid_disks, + conf->raid_disks-1, + &dd_idx, &pd_idx, conf); + sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); + if (sh2 == NULL) + /* so far only the early blocks of this stripe + * have been requested. 
When later blocks + * get requested, we will try again + */ + continue; + if(!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + memcpy(page_address(sh2->dev[dd_idx].page), + page_address(sh->dev[i].page), + STRIPE_SIZE); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); + for (j=0; j<conf->raid_disks; j++) + if (j != sh2->pd_idx && + !test_bit(R5_Expanded, &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (syncing) + if (syncing || expanding || expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x; - unsigned long stripe; - int chunk_offset; - int dd_idx, pd_idx; - sector_t first_sector; + int pd_idx; + sector_t first_sector, last_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; sector_t max_sector = mddev->size << 1; @@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. + */ + int i; + int dd_idx; + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + int j; + int skipped = 0; + pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); + sh = get_active_stripe(conf, sector_nr+i, + conf->raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPANDING, &sh->state); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + s = compute_blocknr(sh, j); + if (s < (mddev->array_size<<1)) { + skipped = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + release_stripe(sh); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripes are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes.
+ */ + raid_disks = conf->previous_raid_disks; + data_disks = raid_disks - 1; + first_sector = + raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + last_sector = + raid5_compute_sector((sector_nr+conf->chunk_size/512) + *(conf->raid_disks-1) -1, + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + if (last_sector >= (mddev->size<<1)) + last_sector = (mddev->size<<1)-1; + while (first_sector <= last_sector) { + pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + sh = get_active_stripe(conf, first_sector, + conf->previous_raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + first_sector += STRIPE_SECTORS; + } + return conf->chunk_size>>9; + } /* if there is 1 or more failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - x = sector_nr; - chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; - BUG_ON(x != stripe); - - first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk - + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 617b9506c760..4e26ef2cacca 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -157,6 +157,9 @@ struct mddev_s * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC nor RESHAPE is set, then it is a recovery. */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 @@ -166,6 +169,7 @@ struct mddev_s #define MD_RECOVERY_NEEDED 5 #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 unsigned long recovery; int in_sync; /* know to not need resync */ diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fa274aea2a0..55c738d50508 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -157,6 +157,7 @@ struct stripe_head { #define R5_ReadError 8 /* seen a read error here recently */ #define R5_ReWrite 9 /* have tried to over-write the readerror */ +#define R5_Expanded 10 /* This block now has post-expand data */ /* * Write method */ @@ -176,7 +177,8 @@ struct stripe_head { #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 #define STRIPE_EXPANDING 9 - +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 /* * Plugging: * -- cgit v1.2.3 From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices.
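A hypothetical, userspace-only illustration of that bookkeeping: the superblock carries the position plus enough geometry to derive both shapes, with the old disk count recovered from raid_disks and delta_disks exactly as the md_p.h note in this patch spells out. Field and helper names here are invented for the example.

#include <stdio.h>
#include <stdint.h>

struct reshape_geom {
    uint32_t raid_disks;       /* the larger of the old and new counts */
    int32_t  delta_disks;      /* new minus old */
    uint64_t reshape_position; /* next array-space sector to convert */
};

static uint32_t old_disks(const struct reshape_geom *g)
{
    /* +ve delta: raid_disks already holds the new (larger) count,
     * so the old count is raid_disks - delta_disks.
     * -ve delta: raid_disks holds the old (larger) count as-is. */
    return g->delta_disks > 0 ? g->raid_disks - g->delta_disks
                              : g->raid_disks;
}

int main(void)
{
    struct reshape_geom g = { 5, 1, 0 };
    printf("reshape %u -> %u disks, restarting at sector %llu\n",
           old_disks(&g), g.raid_disks,
           (unsigned long long)g.reshape_position);
    return 0;
}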
When using a version-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse to assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 69 +++++++++++++++++++++- drivers/md/raid1.c | 5 ++ drivers/md/raid5.c | 140 ++++++++++++++++++++++++++++++++++++++------- include/linux/raid/md.h | 2 + include/linux/raid/md_k.h | 8 +++ include/linux/raid/md_p.h | 32 ++++++++++- include/linux/raid/raid5.h | 1 + 7 files changed, 231 insertions(+), 26 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/drivers/md/md.c b/drivers/md/md.c index d169bc964676..b9dfdfccdb78 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version } if (sb->major_version != 0 || - sb->minor_version != 90) { + sb->minor_version < 90 || + sb->minor_version > 91) { printk(KERN_WARNING "Bad version number %d.%d on %s\n", sb->major_version, sb->minor_version, b); @@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + if (mddev->minor_version >= 91) { + mddev->reshape_position = sb->reshape_position; + mddev->delta_disks = sb->delta_disks; + mddev->new_level = sb->new_level; + mddev->new_layout = sb->new_layout; + mddev->new_chunk = sb->new_chunk; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else { @@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; - sb->minor_version = mddev->minor_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); @@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; + if (mddev->reshape_position == MaxSector) + sb->minor_version = 90; + else { + sb->minor_version = 91; + sb->reshape_position = mddev->reshape_position; + sb->new_level = mddev->new_level; + sb->delta_disks = mddev->delta_disks; + sb->new_layout = mddev->new_layout; + sb->new_chunk = mddev->new_chunk; + } + mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->recovery_cp; @@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk =
mddev->chunk_size; + } + } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ __u64 ev1 = le64_to_cpu(sb->events); @@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + } max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) @@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev) } } -static void md_update_sb(mddev_t * mddev) +void md_update_sb(mddev_t * mddev) { int err; struct list_head *tmp; @@ -1574,6 +1621,7 @@ repeat: wake_up(&mddev->sb_wait); } +EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (mddev->reshape_position != MaxSector && + pers->reshape == NULL) { + /* This personality cannot handle reshaping... */ + mddev->pers = NULL; + module_put(pers->owner); + return -EINVAL; + } + mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; @@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; + mddev->reshape_position = MaxSector; + /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); + mddev->new_level = mddev->level; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5d88329e3c7a..b65b8cfbdf30 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev) mdname(mddev), mddev->level); goto out; } + if (mddev->reshape_position != MaxSector) { + printk("raid1: %s: reshape_position set but not supported\n", + mdname(mddev)); + goto out; + } /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29135acb1d9..20ae32d67e21 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/bio.h> #include <linux/raid/raid5.h> @@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } @@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + + if (sector_nr == 0 && + conf->expand_progress != 0) { + /* restarting in the middle, skip the initial sectors */ + sector_nr = conf->expand_progress; + sector_div(sector_nr, conf->raid_disks-1); + *skipped = 1; + return sector_nr; + } + + /* Cannot proceed until we've updated the superblock...
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; @@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sh = get_active_stripe(conf, sector_nr+i, conf->raid_disks, pd_idx, 0); set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old * array, then we need to zero those blocks */ @@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev) return -EIO; } + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { + printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); + /* here_old is the first stripe that we might need to read from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } + + mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + if (mddev->reshape_position == MaxSector) { + conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; + } else { + conf->raid_disks = mddev->raid_disks; + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + } + + conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); if (!conf->disks) goto abort; @@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,tmp) { raid_disk = rdev->raid_disk; - if (raid_disk >= mddev->raid_disks + if (raid_disk >= conf->raid_disks || raid_disk < 0) continue; disk = conf->disks + raid_disk; @@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev) } } - conf->raid_disks = mddev->raid_disks; /* * 0 for a fully functional array, 1 for a degraded array. 
*/ @@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = MaxSector; + conf->expand_progress = mddev->reshape_position; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); + if (conf->expand_progress != MaxSector) { + printk("...ok start reshape thread\n"); + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + /* FIXME if md_register_thread fails?? */ + md_wakeup_thread(mddev->sync_thread); + + } + /* read-ahead size must cover two whole stripes, which is * 2 * (n-1) * chunksize where 'n' is the number of raid devices */ @@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); - mddev->array_size = mddev->size * (mddev->raid_disks - 1); return 0; abort: if (conf) { @@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) /* * find the disk ... */ - for (disk=0; disk < mddev->raid_disks; disk++) + for (disk=0; disk < conf->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; @@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) if (err) return err; + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - mddev->raid_disks = conf->raid_disks = raid_disks; + conf->raid_disks = raid_disks; conf->expand_progress = 0; spin_unlock_irq(&conf->device_lock); @@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) } mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + mddev->raid_disks = raid_disks; + mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->reshape_position = 0; + mddev->sb_dirty = 1; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf) { struct block_device *bdev; - conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + 
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + conf->mddev->reshape_position = MaxSector; } - spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; - spin_unlock_irq(&conf->device_lock); } static void raid5_quiesce(mddev_t *mddev, int state) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 9c77cde5a795..66b44e5e0d6e 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); +extern void md_update_sb(mddev_t * mddev); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4e26ef2cacca..1a6f9f2f6282 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -132,6 +132,14 @@ struct mddev_s char uuid[16]; + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* blocks scheduled */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index c100fa5d4bfa..774e1acfb8c4 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_ERRORS 1 #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +/* + * Notes: + * - if an array is being reshaped (restriped) in order to change + * the number of active devices in the array, 'raid_disks' will be + * the larger of the old and new numbers. 'delta_disks' will + * be the "new - old". So if +ve, raid_disks is the new value, and + * "raid_disks-delta_disks" is the old. If -ve, raid_disks is the + * old value and "raid_disks+delta_disks" is the new (smaller) value. + */ + + typedef struct mdp_superblock_s { /* * Constant generic information @@ -146,7 +158,13 @@ typedef struct mdp_superblock_s { __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ #endif __u32 recovery_cp; /* 11 recovery checkpoint sector count */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12]; + /* These are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; /* * Personality information @@ -207,7 +225,14 @@ struct mdp_superblock_1 { * NOTE: signed, so bitmap can be before superblock * only meaningful of feature_map[0] is set.
*/ - __u8 pad1[128-100]; /* set to 0 when written */ + + /* These are only valid with feature bit '4' */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 new_level; /* new level we are reshaping to */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (bytes) */ + __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ @@ -240,8 +265,9 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 1 +#define MD_FEATURE_ALL 5 #endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 55c738d50508..abcdf0d0658a 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -224,6 +224,7 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. */ -- cgit v1.2.3 From b578d55fdd80140f657130abd85aebeb345755fb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:12 -0800 Subject: [PATCH] md: Only checkpoint expansion progress occasionally Instead of checkpointing at each stripe, only checkpoint when a new write would overwrite uncheckpointed data. Block any write to the uncheckpointed area. Arbitrarily checkpoint at least every 3Meg. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 53 ++++++++++++++++++++++++++++++++++++---------- include/linux/raid/raid5.h | 3 +++ 2 files changed, 45 insertions(+), 11 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 20ae32d67e21..089a32604305 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1762,8 +1762,9 @@ static int make_request(request_queue_t *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks; - + retry: + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (likely(conf->expand_progress == MaxSector)) disks = conf->raid_disks; else { @@ -1771,6 +1772,13 @@ static int make_request(request_queue_t *q, struct bio * bi) disks = conf->raid_disks; if (logical_sector >= conf->expand_progress) disks = conf->previous_raid_disks; + else { + if (logical_sector >= conf->expand_lo) { + spin_unlock_irq(&conf->device_lock); + schedule(); + goto retry; + } + } spin_unlock_irq(&conf->device_lock); } new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, @@ -1779,7 +1787,6 @@ static int make_request(request_queue_t *q, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); if (sh) { if (unlikely(conf->expand_progress != MaxSector)) { @@ -1877,6 +1884,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + sector_t writepos, safepos, gap; if (sector_nr == 0 && conf->expand_progress != 0) { @@ -1887,15 +1895,36 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sector_nr; 
} - /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; - - mddev->sb_dirty = 1; - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || - kthread_should_stop()); + /* we update the metadata when there is more than 3Meg + * in the block range (that is rather arbitrary, should + * probably be time based) or when the data about to be + * copied would over-write the source of the data at + * the front of the range. + * i.e. one new_stripe forward from expand_progress new_maps + * to after where expand_lo old_maps to + */ + writepos = conf->expand_progress + + conf->chunk_size/512*(conf->raid_disks-1); + sector_div(writepos, conf->raid_disks-1); + safepos = conf->expand_lo; + sector_div(safepos, conf->previous_raid_disks-1); + gap = conf->expand_progress - conf->expand_lo; + + if (writepos >= safepos || + gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; @@ -2322,6 +2351,7 @@ static int run(mddev_t *mddev) if (conf->expand_progress != MaxSector) { printk("...ok start reshape thread\n"); + conf->expand_lo = conf->expand_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -2610,6 +2640,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) conf->previous_raid_disks = conf->raid_disks; conf->raid_disks = raid_disks; conf->expand_progress = 0; + conf->expand_lo = 0; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index abcdf0d0658a..914af667044f 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -217,6 +217,9 @@ struct raid5_private_data { /* used during an expand */ sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress is out-of-bounds + * as we haven't flushed the metadata yet + */ int previous_raid_disks; struct list_head handle_list; /* stripes needing handling */ -- cgit v1.2.3
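The checkpoint rule in that last hunk is compact enough to model on its own. Below is a self-contained sketch (hypothetical helper name, simplified sector math) of when the reshape loop decides it must pause and flush metadata: either the next destination write has caught up with the oldest unflushed source data (writepos >= safepos), or more than the arbitrary 3Meg of relocated-but-unrecorded data has accumulated.

#include <stdio.h>

/* ~3MB expressed in 512-byte sectors per data disk, as in the patch */
#define CHECKPOINT_GAP (3000 * 2)

static int must_checkpoint(unsigned long long expand_progress,
                           unsigned long long expand_lo,
                           int raid_disks, int prev_raid_disks,
                           unsigned long long chunk_sectors)
{
    /* per-device sector the next new-geometry stripe will write */
    unsigned long long writepos =
        (expand_progress + chunk_sectors * (raid_disks - 1)) / (raid_disks - 1);
    /* per-device sector of the oldest data not yet checkpointed */
    unsigned long long safepos = expand_lo / (prev_raid_disks - 1);
    unsigned long long gap = expand_progress - expand_lo;

    return writepos >= safepos ||
           gap > (unsigned long long)(raid_disks - 1) * CHECKPOINT_GAP;
}

int main(void)
{
    /* model growing 4 -> 5 disks with 64KB (128-sector) chunks */
    unsigned long long lo = 0;
    for (unsigned long long prog = 0; prog < 200000; prog += 128 * 4) {
        if (must_checkpoint(prog, lo, 5, 4, 128)) {
            printf("flush metadata at array sector %llu (gap %llu)\n",
                   prog, prog - lo);
            lo = prog; /* superblock written: expand_lo catches up */
        }
    }
    return 0;
}

Early in the reshape the writepos/safepos test fires on nearly every pass, because the destination stripes sit almost on top of their sources; as the conversion point moves forward the two geometries diverge and the arbitrary 3Meg gap becomes the limiting rule, which is exactly why this patch checkpoints only occasionally instead of once per stripe.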