diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 325 |
1 files changed, 175 insertions, 150 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59e44e99eef3..15ef2c641b2b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio *return_bi) +static void return_io(struct bio_list *return_bi) { - struct bio *bi = return_bi; - while (bi) { - - return_bi = bi->bi_next; - bi->bi_next = NULL; + struct bio *bi; + while ((bi = bio_list_pop(return_bi)) != NULL) { bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); - bio_endio(bi, 0); - bi = return_bi; + bio_endio(bi); } } @@ -887,9 +883,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) } static void -raid5_end_read_request(struct bio *bi, int error); +raid5_end_read_request(struct bio *bi); static void -raid5_end_write_request(struct bio *bi, int error); +raid5_end_write_request(struct bio *bi); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { @@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; + struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } + if (!raid5_dec_bi_active_stripes(rbi)) + bio_list_add(&return_bi, rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(return_bi); + return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -2162,6 +2156,9 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!sc) return -ENOMEM; + /* Need to ensure auto-resizing doesn't interfere */ + mutex_lock(&conf->cache_size_mutex); + for (i = conf->max_nr_stripes; i; i--) { nsh = alloc_stripe(sc, GFP_KERNEL); if (!nsh) @@ -2178,6 +2175,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) kmem_cache_free(sc, nsh); } kmem_cache_destroy(sc); + mutex_unlock(&conf->cache_size_mutex); return -ENOMEM; } /* Step 2 - Must use GFP_NOIO now. @@ -2224,6 +2222,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; + mutex_unlock(&conf->cache_size_mutex); /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2251,7 +2250,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) static int drop_one_stripe(struct r5conf *conf) { struct stripe_head *sh; - int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; + int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; spin_lock_irq(conf->hash_locks + hash); sh = get_free_stripe(conf, hash); @@ -2277,12 +2276,11 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } -static void raid5_end_read_request(struct bio * bi, int error) +static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; sector_t s; @@ -2291,9 +2289,9 @@ static void raid5_end_read_request(struct bio * bi, int error) if (bi == &sh->dev[i].req) break; - pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", + pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; @@ -2312,7 +2310,7 @@ static void raid5_end_read_request(struct bio * bi, int error) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (uptodate) { + if (!bi->bi_error) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2400,13 +2398,12 @@ static void raid5_end_read_request(struct bio * bi, int error) release_stripe(sh); } -static void raid5_end_write_request(struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; struct md_rdev *uninitialized_var(rdev); - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2429,23 +2426,23 @@ static void raid5_end_write_request(struct bio *bi, int error) break; } } - pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", + pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; } if (replacement) { - if (!uptodate) + if (bi->bi_error) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (!uptodate) { + if (bi->bi_error) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2466,7 +2463,7 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && !uptodate && !replacement) + if (sh->batch_head && bi->bi_error && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) @@ -2514,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -3066,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, - struct bio **return_bi) + struct bio_list *return_bi) { int i; BUG_ON(sh->batch_head); @@ -3107,11 +3105,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = nextbi; } @@ -3131,11 +3129,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = bi2; } @@ -3156,11 +3154,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_active_stripes(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } + + bi->bi_error = -EIO; + if (!raid5_dec_bi_active_stripes(bi)) + bio_list_add(return_bi, bi); bi = nextbi; } } @@ -3439,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) + struct stripe_head *sh, int disks, struct bio_list *return_bi) { int i; struct r5dev *dev; @@ -3473,8 +3470,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; + bio_list_add(return_bi, wbi); } wbi = wbi2; } @@ -4061,8 +4057,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { - if (rdev) + if (rdev && !test_bit(Faulty, &rdev->flags)) set_bit(R5_NeedReplace, &dev->flags); + else + clear_bit(R5_NeedReplace, &dev->flags); rdev = rcu_dereference(conf->disks[i].rdev); clear_bit(R5_ReadRepl, &dev->flags); } @@ -4605,7 +4603,15 @@ finish: md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + if (!bio_list_empty(&s.return_bi)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->return_bi, &s.return_bi); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + return_io(&s.return_bi); + } clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4662,43 +4668,14 @@ static int raid5_congested(struct mddev *mddev, int bits) return 0; } -/* We want read requests to align with chunks where possible, - * but write requests don't need to. - */ -static int raid5_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - - /* - * always allow writes to be mergeable, read as well if array - * is degraded as we'll go through stripe cache anyway. - */ - if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) - return biovec->bv_len; - - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; -} - static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { + struct r5conf *conf = mddev->private; sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -4749,13 +4726,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * first). * If the read failed.. */ -static void raid5_align_endio(struct bio *bi, int error) +static void raid5_align_endio(struct bio *bi) { struct bio* raid_bi = bi->bi_private; struct mddev *mddev; struct r5conf *conf; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); struct md_rdev *rdev; + int error = bi->bi_error; bio_put(bi); @@ -4766,10 +4743,10 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); - if (!error && uptodate) { + if (!error) { trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), raid_bi, 0); - bio_endio(raid_bi, 0); + bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); return; @@ -4780,26 +4757,7 @@ static void raid5_align_endio(struct bio *bi, int error) add_bio_to_retry(raid_bi, conf); } -static int bio_fits_rdev(struct bio *bi) -{ - struct request_queue *q = bdev_get_queue(bi->bi_bdev); - - if (bio_sectors(bi) > queue_max_sectors(q)) - return 0; - blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_segments(q)) - return 0; - - if (q->merge_bvec_fn) - /* it's too hard to apply the merge_bvec_fn at this stage, - * just just give up - */ - return 0; - - return 1; -} - -static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) +static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; int dd_idx; @@ -4808,7 +4766,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { - pr_debug("chunk_aligned_read : non aligned\n"); + pr_debug("%s: non aligned\n", __func__); return 0; } /* @@ -4850,13 +4808,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; - __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags); + bio_clear_flag(align_bi, BIO_SEG_VALID); - if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_iter.bi_sector, + if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), &first_bad, &bad_sectors)) { - /* too big in some way, or has a known bad block */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; @@ -4885,6 +4841,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) } } +static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) +{ + struct bio *split; + + do { + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + + if (sectors < bio_sectors(raid_bio)) { + split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, raid_bio); + } else + split = raid_bio; + + if (!raid5_read_one_chunk(mddev, split)) { + if (split != raid_bio) + generic_make_request(raid_bio); + return split; + } + } while (split != raid_bio); + + return NULL; +} + /* __get_priority_stripe - get the next stripe to process * * Full stripe writes are allowed to pass preread active stripes up until @@ -5133,7 +5114,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { md_write_end(mddev); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5162,9 +5143,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - mddev->reshape_position == MaxSector && - chunk_aligned_read(mddev,bi)) - return; + mddev->reshape_position == MaxSector) { + bi = chunk_aligned_read(mddev, bi); + if (!bi) + return; + } if (unlikely(bi->bi_rw & REQ_DISCARD)) { make_discard_request(mddev, bi); @@ -5297,7 +5280,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - clear_bit(BIO_UPTODATE, &bi->bi_flags); + bi->bi_error = -EIO; break; } } @@ -5311,7 +5294,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5340,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_t stripe_addr; int reshape_sectors; struct list_head stripes; + sector_t retn; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -5347,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; @@ -5355,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->curr_resync_completed = sector_nr; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; - return sector_nr; + retn = sector_nr; + goto finish; } } @@ -5363,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); /* We update the metadata at least every 10 seconds, or when * the data about to be copied would over-write the source of @@ -5381,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - writepos -= min_t(sector_t, reshape_sectors, writepos); + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. + */ readpos -= min_t(sector_t, reshape_sectors, readpos); safepos -= min_t(sector_t, reshape_sectors, safepos); } @@ -5528,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -5553,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } ret: - return reshape_sectors; + return retn; } static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) @@ -5707,7 +5702,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (remaining == 0) { trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), raid_bio, 0); - bio_endio(raid_bio, 0); + bio_endio(raid_bio); } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5809,6 +5804,18 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); + if (!bio_list_empty(&conf->return_bi) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + struct bio_list tmp = BIO_EMPTY_LIST; + spin_lock_irq(&conf->device_lock); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + bio_list_merge(&tmp, &conf->return_bi); + bio_list_init(&conf->return_bi); + } + spin_unlock_irq(&conf->device_lock); + return_io(&tmp); + } + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -5857,12 +5864,14 @@ static void raid5d(struct md_thread *thread) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); - if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && + mutex_trylock(&conf->cache_size_mutex)) { grow_one_stripe(conf, __GFP_NOWARN); /* Set flag even if allocation failed. This helps * slow down allocation requests when mem is short */ set_bit(R5_DID_ALLOC, &conf->cache_state); + mutex_unlock(&conf->cache_size_mutex); } async_tx_issue_pending_all(); @@ -5894,18 +5903,22 @@ raid5_set_cache_size(struct mddev *mddev, int size) return -EINVAL; conf->min_nr_stripes = size; + mutex_lock(&conf->cache_size_mutex); while (size < conf->max_nr_stripes && drop_one_stripe(conf)) ; + mutex_unlock(&conf->cache_size_mutex); err = md_allow_write(mddev); if (err) return err; + mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) if (!grow_one_stripe(conf, GFP_KERNEL)) break; + mutex_unlock(&conf->cache_size_mutex); return 0; } @@ -6243,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) /* size is defined by the smallest of previous and new size */ raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -6371,11 +6384,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); - int ret = 0; - while (ret < sc->nr_to_scan) { - if (drop_one_stripe(conf) == 0) - return SHRINK_STOP; - ret++; + unsigned long ret = SHRINK_STOP; + + if (mutex_trylock(&conf->cache_size_mutex)) { + ret= 0; + while (ret < sc->nr_to_scan && + conf->max_nr_stripes > conf->min_nr_stripes) { + if (drop_one_stripe(conf) == 0) { + ret = SHRINK_STOP; + break; + } + ret++; + } + mutex_unlock(&conf->cache_size_mutex); } return ret; } @@ -6444,6 +6465,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { init_waitqueue_head(&conf->wait_for_stripe[i]); @@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); + bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; } conf->min_nr_stripes = NR_STRIPES; @@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev) sector_t here_new, here_old; int old_disks; int max_degraded = (mddev->level == 6 ? 2 : 1); + int chunk_sectors; + int new_data_disks; if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " @@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev) /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. */ here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { printk(KERN_ERR "md/raid:%s: reshape_position not " "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } - reshape_offset = here_new * mddev->new_chunk_sectors; + reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: reshape position is" - " confused - aborting\n", mdname(mddev)); - return -EINVAL; - } /* We cannot be sure it is safe to start an in-place * reshape. It is only safe if user-space is monitoring * and taking constant backups. @@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev) return -EINVAL; } } else if (mddev->reshape_backwards - ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors + (-min_offset_diff))) { + ? (here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); + conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize; - sectors &= ~((sector_t)mddev->chunk_sectors - 1); + struct r5conf *conf = mddev->private; + + sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && mddev->array_sectors > newsize) @@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf) rdev->data_offset = rdev->new_data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -7757,7 +7785,6 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7781,7 +7808,6 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7806,7 +7832,6 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) |