From b6538fe32966e63ef38897860ef220980d904974 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 8 May 2015 18:19:03 +1000 Subject: md-raid0: conditional mddev->queue access to suit dm-raid This patch is a prerequisite for dm-raid "raid0" support to allow dm-raid to access the MD RAID0 personality doing unconditional accesses to mddev->queue, which is NULL in case of dm-raid stacked on top of MD. Most of the conditional mddev->queue accesses made it to upstream but this missing one, which prohibits md raid0 to set disk stack limits (being done in dm core in case of md underneath dm). Signed-off-by: Heinz Mauelshagen Tested-by: Heinz Mauelshagen Signed-off-by: NeilBrown --- drivers/md/raid0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2cb59a641cd2..6a68ef5246d4 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -188,8 +188,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); + if (mddev->queue) + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset << 9); if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) conf->has_merge_bvec = 1; -- cgit v1.2.3 From f18c1a35f62caccb527e8b0990c8801596e7c662 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:04 +1000 Subject: md/raid5: new alloc_stripe() to allocate an initialize a stripe. The new batch_lock and batch_list fields are being initialized in grow_one_stripe() but not in resize_stripes(). This causes a crash on resize. So separate the core initialization into a new function and call it from both allocation sites. Signed-off-by: NeilBrown Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write") --- drivers/md/raid5.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd720aaa0..91a1e8b26b52 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) +{ + struct stripe_head *sh; + + sh = kmem_cache_zalloc(sc, gfp); + if (sh) { + spin_lock_init(&sh->stripe_lock); + spin_lock_init(&sh->batch_lock); + INIT_LIST_HEAD(&sh->batch_list); + INIT_LIST_HEAD(&sh->lru); + atomic_set(&sh->count, 1); + } + return sh; +} static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = kmem_cache_zalloc(conf->slab_cache, gfp); + + sh = alloc_stripe(conf->slab_cache, gfp); if (!sh) return 0; sh->raid_conf = conf; - spin_lock_init(&sh->stripe_lock); - if (grow_buffers(sh, gfp)) { shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); @@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) sh->hash_lock_index = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; /* we just created an active stripe so... */ - atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); - INIT_LIST_HEAD(&sh->lru); - spin_lock_init(&sh->batch_lock); - INIT_LIST_HEAD(&sh->batch_list); - sh->batch_head = NULL; release_stripe(sh); conf->max_nr_stripes++; return 1; @@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int newsize) return -ENOMEM; for (i = conf->max_nr_stripes; i; i--) { - nsh = kmem_cache_zalloc(sc, GFP_KERNEL); + nsh = alloc_stripe(sc, GFP_KERNEL); if (!nsh) break; nsh->raid_conf = conf; - spin_lock_init(&nsh->stripe_lock); - list_add(&nsh->lru, &newstripes); } if (i) { @@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int newsize) lock_device_hash_lock(conf, hash)); osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); - atomic_set(&nsh->count, 1); + for(i=0; ipool_size; i++) { nsh->dev[i].page = osh->dev[i].page; nsh->dev[i].orig_page = osh->dev[i].page; } - for( ; idev[i].page = NULL; nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); cnt++; -- cgit v1.2.3 From b0c783b32318bef29d64086fa812e8c659cb5b37 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:32 +1000 Subject: md/raid5: more incorrect BUG_ON in handle_stripe_fill. It is not incorrect to call handle_stripe_fill() when a batch of full-stripe writes is active. It is, however, a BUG if fetch_block() then decides it needs to actually fetch anything. So move the 'BUG_ON' to where it belongs. Signed-off-by: NeilBrown Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write") --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 91a1e8b26b52..415cac6d89bd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3302,6 +3302,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, */ BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); + BUG_ON(sh->batch_head); if ((s->uptodate == disks - 1) && (s->failed && (disk_idx == s->failed_num[0] || disk_idx == s->failed_num[1]))) { @@ -3370,7 +3371,6 @@ static void handle_stripe_fill(struct stripe_head *sh, { int i; - BUG_ON(sh->batch_head); /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write -- cgit v1.2.3 From 10d82c5f0d167ef75a2d8d7d4eed9ee43d3369c9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:33 +1000 Subject: md/raid5: avoid reading parity blocks for full-stripe write to degraded array When performing a reconstruct write, we need to read all blocks that are not being over-written .. except the parity (P and Q) blocks. The code currently reads these (as they are not being over-written!) unnecessarily. Signed-off-by: NeilBrown Fixes: ea664c8245f3 ("md/raid5: need_this_block: tidy/fix last condition.") --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 415cac6d89bd..85dc0e67fb88 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3282,7 +3282,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, /* reconstruct-write isn't being forced */ return 0; for (i = 0; i < s->failed; i++) { - if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + if (s->failed_num[i] != sh->pd_idx && + s->failed_num[i] != sh->qd_idx && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && !test_bit(R5_OVERWRITE, &fdev[i]->flags)) return 1; } -- cgit v1.2.3 From 6e9eac2dcee5e19f125967dd2be3e36558c42fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:34 +1000 Subject: md/raid5: don't record new size if resize_stripes fails. If any memory allocation in resize_stripes fails we will return -ENOMEM, but in some cases we update conf->pool_size anyway. This means that if we try again, the allocations will be assumed to be larger than they are, and badness results. So only update pool_size if there is no error. This bug was introduced in 2.6.17 and the patch is suitable for -stable. Fixes: ad01c9e3752f ("[PATCH] md: Allow stripes to be expanded in preparation for expanding an array") Cc: stable@vger.kernel.org (v2.6.17+) Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 85dc0e67fb88..9748e525e4c0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2216,7 +2216,8 @@ static int resize_stripes(struct r5conf *conf, int newsize) conf->slab_cache = sc; conf->active_name = 1-conf->active_name; - conf->pool_size = newsize; + if (!err) + conf->pool_size = newsize; return err; } -- cgit v1.2.3 From 738a273806ee0568369c9bb19ef3b102f54beef4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:39 +1000 Subject: md/raid5: fix allocation of 'scribble' array. As the new 'scribble' array is sized based on chunk size, we need to make sure the size matches the largest of 'old' and 'new' chunk sizes when the array is undergoing reshape. We also potentially need to resize it even when not resizing the stripe cache, as chunk size can change without changing number of devices. So move the 'resize' code into a separate function, and consider old and new sizes when allocating. Signed-off-by: NeilBrown Fixes: 46d5b785621a ("raid5: use flex_array for scribble data") --- drivers/md/raid5.c | 65 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9748e525e4c0..3873eaa6fa2e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2068,6 +2068,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) return ret; } +static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) +{ + unsigned long cpu; + int err = 0; + + mddev_suspend(conf->mddev); + get_online_cpus(); + for_each_present_cpu(cpu) { + struct raid5_percpu *percpu; + struct flex_array *scribble; + + percpu = per_cpu_ptr(conf->percpu, cpu); + scribble = scribble_alloc(new_disks, + new_sectors / STRIPE_SECTORS, + GFP_NOIO); + + if (scribble) { + flex_array_free(percpu->scribble); + percpu->scribble = scribble; + } else { + err = -ENOMEM; + break; + } + } + put_online_cpus(); + mddev_resume(conf->mddev); + return err; +} + static int resize_stripes(struct r5conf *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -2096,7 +2125,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - unsigned long cpu; int err; struct kmem_cache *sc; int i; @@ -2178,25 +2206,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; - get_online_cpus(); - for_each_present_cpu(cpu) { - struct raid5_percpu *percpu; - struct flex_array *scribble; - - percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = scribble_alloc(newsize, conf->chunk_sectors / - STRIPE_SECTORS, GFP_NOIO); - - if (scribble) { - flex_array_free(percpu->scribble); - percpu->scribble = scribble; - } else { - err = -ENOMEM; - break; - } - } - put_online_cpus(); - /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -6228,8 +6237,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu percpu->spare_page = alloc_page(GFP_KERNEL); if (!percpu->scribble) percpu->scribble = scribble_alloc(max(conf->raid_disks, - conf->previous_raid_disks), conf->chunk_sectors / - STRIPE_SECTORS, GFP_KERNEL); + conf->previous_raid_disks), + max(conf->chunk_sectors, + conf->prev_chunk_sectors) + / STRIPE_SECTORS, + GFP_KERNEL); if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { free_scratch_buffer(conf, percpu); @@ -7205,6 +7217,15 @@ static int check_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; + if (mddev->new_chunk_sectors > mddev->chunk_sectors || + mddev->delta_disks > 0) + if (resize_chunks(conf, + conf->previous_raid_disks + + max(0, mddev->delta_disks), + max(mddev->new_chunk_sectors, + mddev->chunk_sectors) + ) < 0) + return -ENOMEM; return resize_stripes(conf, (conf->previous_raid_disks + mddev->delta_disks)); } -- cgit v1.2.3 From bb27051f9fd7643f05d8f0babce3337f0b9b3087 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 May 2015 18:19:40 +1000 Subject: md/raid5: fix handling of degraded stripes in batches. There is no need for special handling of stripe-batches when the array is degraded. There may be if there is a failure in the batch, but STRIPE_DEGRADED does not imply an error. So don't set STRIPE_BATCH_ERR in ops_run_io just because the array is degraded. This actually causes a bug: the STRIPE_DEGRADED flag gets cleared in check_break_stripe_batch_list() and so the bitmap bit gets cleared when it shouldn't. So in check_break_stripe_batch_list(), split the batch up completely - again STRIPE_DEGRADED isn't meaningful. Also don't set STRIPE_BATCH_ERR when there is a write error to a replacement device. This simply removes the replacement device and requires no extra handling. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3873eaa6fa2e..1ba97fdc6df1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1078,9 +1078,6 @@ again: pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); - if (sh->batch_head) - set_bit(STRIPE_BATCH_ERR, - &sh->batch_head->state); set_bit(STRIPE_HANDLE, &sh->state); } @@ -2448,7 +2445,7 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && !uptodate) + if (sh->batch_head && !uptodate && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) @@ -4214,15 +4211,9 @@ static void check_break_stripe_batch_list(struct stripe_head *sh) return; head_sh = sh; - do { - sh = list_first_entry(&sh->batch_list, - struct stripe_head, batch_list); - BUG_ON(sh == head_sh); - } while (!test_bit(STRIPE_DEGRADED, &sh->state)); - while (sh != head_sh) { - next = list_first_entry(&sh->batch_list, - struct stripe_head, batch_list); + list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { + list_del_init(&sh->batch_list); set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, @@ -4242,8 +4233,6 @@ static void check_break_stripe_batch_list(struct stripe_head *sh) set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); - - sh = next; } } -- cgit v1.2.3