From b6538fe32966e63ef38897860ef220980d904974 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 8 May 2015 18:19:03 +1000
Subject: md-raid0: conditional mddev->queue access to suit dm-raid

This patch is a prerequisite for dm-raid "raid0" support to allow
dm-raid to access the MD RAID0 personality doing unconditional
accesses to mddev->queue, which is NULL in case of dm-raid stacked on
top of MD.

Most of the conditional mddev->queue accesses made it to upstream but
this missing one, which prohibits md raid0 to set disk stack limits
(being done in dm core in case of md underneath dm).

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Tested-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 2cb59a641cd2..6a68ef5246d4 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -188,8 +188,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 		dev[j] = rdev1;
 
-		disk_stack_limits(mddev->gendisk, rdev1->bdev,
-				  rdev1->data_offset << 9);
+		if (mddev->queue)
+			disk_stack_limits(mddev->gendisk, rdev1->bdev,
+					  rdev1->data_offset << 9);
 
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
 			conf->has_merge_bvec = 1;
-- 
cgit v1.2.3


From f18c1a35f62caccb527e8b0990c8801596e7c662 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:04 +1000
Subject: md/raid5: new alloc_stripe() to allocate an initialize a stripe.

The new batch_lock and batch_list fields are being initialized in
grow_one_stripe() but not in resize_stripes().  This causes a crash
on resize.

So separate the core initialization into a new function and call it
from both allocation sites.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write")
---
 drivers/md/raid5.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd720aaa0..91a1e8b26b52 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+{
+	struct stripe_head *sh;
+
+	sh = kmem_cache_zalloc(sc, gfp);
+	if (sh) {
+		spin_lock_init(&sh->stripe_lock);
+		spin_lock_init(&sh->batch_lock);
+		INIT_LIST_HEAD(&sh->batch_list);
+		INIT_LIST_HEAD(&sh->lru);
+		atomic_set(&sh->count, 1);
+	}
+	return sh;
+}
 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
 	struct stripe_head *sh;
-	sh = kmem_cache_zalloc(conf->slab_cache, gfp);
+
+	sh = alloc_stripe(conf->slab_cache, gfp);
 	if (!sh)
 		return 0;
 
 	sh->raid_conf = conf;
 
-	spin_lock_init(&sh->stripe_lock);
-
 	if (grow_buffers(sh, gfp)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
@@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 	sh->hash_lock_index =
 		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	/* we just created an active stripe so... */
-	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
-	INIT_LIST_HEAD(&sh->lru);
 
-	spin_lock_init(&sh->batch_lock);
-	INIT_LIST_HEAD(&sh->batch_list);
-	sh->batch_head = NULL;
 	release_stripe(sh);
 	conf->max_nr_stripes++;
 	return 1;
@@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		return -ENOMEM;
 
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
+		nsh = alloc_stripe(sc, GFP_KERNEL);
 		if (!nsh)
 			break;
 
 		nsh->raid_conf = conf;
-		spin_lock_init(&nsh->stripe_lock);
-
 		list_add(&nsh->lru, &newstripes);
 	}
 	if (i) {
@@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 				    lock_device_hash_lock(conf, hash));
 		osh = get_free_stripe(conf, hash);
 		unlock_device_hash_lock(conf, hash);
-		atomic_set(&nsh->count, 1);
+
 		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
 			nsh->dev[i].orig_page = osh->dev[i].page;
 		}
-		for( ; i<newsize; i++)
-			nsh->dev[i].page = NULL;
 		nsh->hash_lock_index = hash;
 		kmem_cache_free(conf->slab_cache, osh);
 		cnt++;
-- 
cgit v1.2.3


From b0c783b32318bef29d64086fa812e8c659cb5b37 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:32 +1000
Subject: md/raid5: more incorrect BUG_ON in handle_stripe_fill.

It is not incorrect to call handle_stripe_fill() when
a batch of full-stripe writes is active.
It is, however, a BUG if fetch_block() then decides
it needs to actually fetch anything.

So move the 'BUG_ON' to where it belongs.

Signed-off-by: NeilBrown  <neilb@suse.de>
Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write")
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 91a1e8b26b52..415cac6d89bd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3302,6 +3302,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		 */
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
+		BUG_ON(sh->batch_head);
 		if ((s->uptodate == disks - 1) &&
 		    (s->failed && (disk_idx == s->failed_num[0] ||
 				   disk_idx == s->failed_num[1]))) {
@@ -3370,7 +3371,6 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
 	int i;
 
-	BUG_ON(sh->batch_head);
 	/* look for blocks to read/compute, skip this if a compute
 	 * is already in flight, or if the stripe contents are in the
 	 * midst of changing due to a write
-- 
cgit v1.2.3


From 10d82c5f0d167ef75a2d8d7d4eed9ee43d3369c9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:33 +1000
Subject: md/raid5: avoid reading parity blocks for full-stripe write to
 degraded array

When performing a reconstruct write, we need to read all blocks
that are not being over-written .. except the parity (P and Q) blocks.

The code currently reads these (as they are not being over-written!)
unnecessarily.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: ea664c8245f3 ("md/raid5: need_this_block: tidy/fix last condition.")
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 415cac6d89bd..85dc0e67fb88 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3282,7 +3282,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 		/* reconstruct-write isn't being forced */
 		return 0;
 	for (i = 0; i < s->failed; i++) {
-		if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+		if (s->failed_num[i] != sh->pd_idx &&
+		    s->failed_num[i] != sh->qd_idx &&
+		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
 		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
 			return 1;
 	}
-- 
cgit v1.2.3


From 6e9eac2dcee5e19f125967dd2be3e36558c42fff Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:34 +1000
Subject: md/raid5: don't record new size if resize_stripes fails.

If any memory allocation in resize_stripes fails we will return
-ENOMEM, but in some cases we update conf->pool_size anyway.

This means that if we try again, the allocations will be assumed
to be larger than they are, and badness results.

So only update pool_size if there is no error.

This bug was introduced in 2.6.17 and the patch is suitable for
-stable.

Fixes: ad01c9e3752f ("[PATCH] md: Allow stripes to be expanded in preparation for expanding an array")
Cc: stable@vger.kernel.org (v2.6.17+)
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 85dc0e67fb88..9748e525e4c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2216,7 +2216,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 
 	conf->slab_cache = sc;
 	conf->active_name = 1-conf->active_name;
-	conf->pool_size = newsize;
+	if (!err)
+		conf->pool_size = newsize;
 	return err;
 }
 
-- 
cgit v1.2.3


From 738a273806ee0568369c9bb19ef3b102f54beef4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:39 +1000
Subject: md/raid5: fix allocation of 'scribble' array.

As the new 'scribble' array is sized based on chunk size,
we need to make sure the size matches the largest of 'old'
and 'new' chunk sizes when the array is undergoing reshape.

We also potentially need to resize it even when not resizing
the stripe cache, as chunk size can change without changing
number of devices.

So move the 'resize' code into a separate function, and
consider old and new sizes when allocating.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 46d5b785621a ("raid5: use flex_array for scribble data")
---
 drivers/md/raid5.c | 65 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9748e525e4c0..3873eaa6fa2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2068,6 +2068,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
 	return ret;
 }
 
+static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
+{
+	unsigned long cpu;
+	int err = 0;
+
+	mddev_suspend(conf->mddev);
+	get_online_cpus();
+	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu;
+		struct flex_array *scribble;
+
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		scribble = scribble_alloc(new_disks,
+					  new_sectors / STRIPE_SECTORS,
+					  GFP_NOIO);
+
+		if (scribble) {
+			flex_array_free(percpu->scribble);
+			percpu->scribble = scribble;
+		} else {
+			err = -ENOMEM;
+			break;
+		}
+	}
+	put_online_cpus();
+	mddev_resume(conf->mddev);
+	return err;
+}
+
 static int resize_stripes(struct r5conf *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -2096,7 +2125,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	unsigned long cpu;
 	int err;
 	struct kmem_cache *sc;
 	int i;
@@ -2178,25 +2206,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
-	get_online_cpus();
-	for_each_present_cpu(cpu) {
-		struct raid5_percpu *percpu;
-		struct flex_array *scribble;
-
-		percpu = per_cpu_ptr(conf->percpu, cpu);
-		scribble = scribble_alloc(newsize, conf->chunk_sectors /
-			STRIPE_SECTORS, GFP_NOIO);
-
-		if (scribble) {
-			flex_array_free(percpu->scribble);
-			percpu->scribble = scribble;
-		} else {
-			err = -ENOMEM;
-			break;
-		}
-	}
-	put_online_cpus();
-
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -6228,8 +6237,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 		percpu->spare_page = alloc_page(GFP_KERNEL);
 	if (!percpu->scribble)
 		percpu->scribble = scribble_alloc(max(conf->raid_disks,
-			conf->previous_raid_disks), conf->chunk_sectors /
-			STRIPE_SECTORS, GFP_KERNEL);
+						      conf->previous_raid_disks),
+						  max(conf->chunk_sectors,
+						      conf->prev_chunk_sectors)
+						   / STRIPE_SECTORS,
+						  GFP_KERNEL);
 
 	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
 		free_scratch_buffer(conf, percpu);
@@ -7205,6 +7217,15 @@ static int check_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
+	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
+	    mddev->delta_disks > 0)
+		if (resize_chunks(conf,
+				  conf->previous_raid_disks
+				  + max(0, mddev->delta_disks),
+				  max(mddev->new_chunk_sectors,
+				      mddev->chunk_sectors)
+			    ) < 0)
+			return -ENOMEM;
 	return resize_stripes(conf, (conf->previous_raid_disks
 				     + mddev->delta_disks));
 }
-- 
cgit v1.2.3


From bb27051f9fd7643f05d8f0babce3337f0b9b3087 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:40 +1000
Subject: md/raid5: fix handling of degraded stripes in batches.

There is no need for special handling of stripe-batches when the array
is degraded.

There may be if there is a failure in the batch, but STRIPE_DEGRADED
does not imply an error.

So don't set STRIPE_BATCH_ERR in ops_run_io just because the array is
degraded.
This actually causes a bug: the STRIPE_DEGRADED flag gets cleared in
check_break_stripe_batch_list() and so the bitmap bit gets cleared
when it shouldn't.

So in check_break_stripe_batch_list(), split the batch up completely -
again STRIPE_DEGRADED isn't meaningful.

Also don't set STRIPE_BATCH_ERR when there is a write error to a
replacement device.  This simply removes the replacement device and
requires no extra handling.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3873eaa6fa2e..1ba97fdc6df1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1078,9 +1078,6 @@ again:
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
-			if (sh->batch_head)
-				set_bit(STRIPE_BATCH_ERR,
-					&sh->batch_head->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
 
@@ -2448,7 +2445,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && !uptodate)
+	if (sh->batch_head && !uptodate && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -4214,15 +4211,9 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 		return;
 
 	head_sh = sh;
-	do {
-		sh = list_first_entry(&sh->batch_list,
-				      struct stripe_head, batch_list);
-		BUG_ON(sh == head_sh);
-	} while (!test_bit(STRIPE_DEGRADED, &sh->state));
 
-	while (sh != head_sh) {
-		next = list_first_entry(&sh->batch_list,
-					struct stripe_head, batch_list);
+	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
+
 		list_del_init(&sh->batch_list);
 
 		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
@@ -4242,8 +4233,6 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
-
-		sh = next;
 	}
 }
 
-- 
cgit v1.2.3