Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 582
1 file changed, 468 insertions, 114 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r10conf *conf = data;
 	int size = offsetof(struct r10bio, devs[conf->copies]);
 
-	/* allocate a r10bio with room for raid_disks entries in the bios array */
+	/* allocate a r10bio with room for raid_disks entries in the
+	 * bios array */
 	return kzalloc(size, gfp_flags);
 }
 
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
+		if (!conf->have_replacement)
+			continue;
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+		if (!bio)
+			goto out_free_bio;
+		r10_bio->devs[j].repl_bio = bio;
 	}
 	/*
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
 	for (j = 0 ; j < nalloc; j++) {
+		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
 			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 				goto out_free_pages;
 
 			bio->bi_io_vec[i].bv_page = page;
+			if (rbio)
+				rbio->bi_io_vec[i].bv_page = page;
 		}
 	}
 
@@ -156,8 +166,11 @@ out_free_pages:
 		safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
-	while ( ++j < nalloc )
+	while (++j < nalloc) {
 		bio_put(r10_bio->devs[j].bio);
+		if (r10_bio->devs[j].repl_bio)
+			bio_put(r10_bio->devs[j].repl_bio);
+	}
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 			}
 			bio_put(bio);
 		}
+		bio = r10bio->devs[j].repl_bio;
+		if (bio)
+			bio_put(bio);
 	}
 	r10bio_pool_free(r10bio, conf);
 }
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
 		*bio = NULL;
+		bio = &r10_bio->devs[i].repl_bio;
+		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+			bio_put(*bio);
+		*bio = NULL;
 	}
 }
 
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
  * Find the disk number which triggered given bio
  */
 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
-			 struct bio *bio, int *slotp)
+			 struct bio *bio, int *slotp, int *replp)
 {
 	int slot;
+	int repl = 0;
 
-	for (slot = 0; slot < conf->copies; slot++)
+	for (slot = 0; slot < conf->copies; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
+		if (r10_bio->devs[slot].repl_bio == bio) {
+			repl = 1;
+			break;
+		}
+	}
 
 	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
 		*slotp = slot;
+	if (replp)
+		*replp = repl;
 	return r10_bio->devs[slot].devnum;
 }
 
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
+	struct md_rdev *rdev;
 	struct r10conf *conf = r10_bio->mddev->private;
 
 	slot = r10_bio->read_slot;
 	dev = r10_bio->devs[slot].devnum;
+	rdev = r10_bio->devs[slot].rdev;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		printk_ratelimited(KERN_ERR
 				   "md/raid10:%s: %s: rescheduling sector %llu\n",
 				   mdname(conf->mddev),
-				   bdevname(conf->mirrors[dev].rdev->bdev, b),
+				   bdevname(rdev->bdev, b),
 				   (unsigned long long)r10_bio->sector);
 		set_bit(R10BIO_ReadError, &r10_bio->state);
 		reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int dev;
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
-	int slot;
+	int slot, repl;
+	struct md_rdev *rdev = NULL;
 
-	dev = find_bio_disk(conf, r10_bio, bio, &slot);
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		smp_rmb();
+		repl = 0;
+		rdev = conf->mirrors[dev].rdev;
+	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-		dec_rdev = 0;
+		if (repl)
+			/* Never record new bad blocks to replacement,
+			 * just fail it.
+			 */
+			md_error(rdev->mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+			dec_rdev = 0;
+		}
 	} else {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
-		if (is_badblock(conf->mirrors[dev].rdev,
+		if (is_badblock(rdev,
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors)) {
 			bio_put(bio);
-			r10_bio->devs[slot].bio = IO_MADE_GOOD;
+			if (repl)
+				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+			else
+				r10_bio->devs[slot].bio = IO_MADE_GOOD;
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
-
 /*
  * RAID10 layout manager
  * As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
+static struct md_rdev *read_balance(struct r10conf *conf,
+				    struct r10bio *r10_bio,
+				    int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev, *best_rdev;
 	int do_balance;
 	int best_slot;
 
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
 retry:
 	sectors = r10_bio->sectors;
 	best_slot = -1;
+	best_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -599,10 +652,16 @@ retry:
 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		rdev = rcu_dereference(conf->mirrors[disk].replacement);
+		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+			rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL)
 			continue;
-		if (!test_bit(In_sync, &rdev->flags))
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			continue;
 
 		dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
 			if (good_sectors > best_good_sectors) {
 				best_good_sectors = good_sectors;
 				best_slot = slot;
+				best_rdev = rdev;
 			}
 			if (!do_balance)
 				/* Must read from here */
@@ -655,16 +715,15 @@ retry:
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
 			best_slot = slot;
+			best_rdev = rdev;
 		}
 	}
-	if (slot == conf->copies)
+	if (slot >= conf->copies) {
 		slot = best_slot;
+		rdev = best_rdev;
+	}
 
 	if (slot >= 0) {
-		disk = r10_bio->devs[slot].devnum;
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (!rdev)
-			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		if (test_bit(Faulty, &rdev->flags)) {
 			/* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
 		}
 		r10_bio->read_slot = slot;
 	} else
-		disk = -1;
+		rdev = NULL;
 	rcu_read_unlock();
 	*max_sectors = best_good_sectors;
 
-	return disk;
+	return rdev;
 }
 
 static int raid10_congested(void *data, int bits)
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
-	struct mirror_info *mirror;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int disk;
+		struct md_rdev *rdev;
 		int slot;
 
 read_again:
-		disk = read_balance(conf, r10_bio, &max_sectors);
-		slot = r10_bio->read_slot;
-		if (disk < 0) {
+		rdev = read_balance(conf, r10_bio, &max_sectors);
+		if (!rdev) {
 			raid_end_bio_io(r10_bio);
 			return;
 		}
-		mirror = conf->mirrors + disk;
+		slot = r10_bio->read_slot;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
 			    max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
+		r10_bio->devs[slot].rdev = rdev;
 
 		read_bio->bi_sector = r10_bio->devs[slot].addr +
-			mirror->rdev->data_offset;
-		read_bio->bi_bdev = mirror->rdev->bdev;
+			rdev->data_offset;
+		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
 	 */
 	plugged = mddev_check_plugged(mddev);
 
+	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
 	for (i = 0;  i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[d].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
 			break;
 		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
+		if (rrdev && test_bit(Faulty, &rrdev->flags))
+			rrdev = NULL;
+
 		r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].repl_bio = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
@@ -1088,6 +1160,10 @@ retry_write:
 		}
 		r10_bio->devs[i].bio = bio;
 		atomic_inc(&rdev->nr_pending);
+		if (rrdev) {
+			r10_bio->devs[i].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
 	}
 	rcu_read_unlock();
 
@@ -1096,11 +1172,23 @@ retry_write:
 		int j;
 		int d;
 
-		for (j = 0; j < i; j++)
+		for (j = 0; j < i; j++) {
 			if (r10_bio->devs[j].bio) {
 				d = r10_bio->devs[j].devnum;
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
+			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
+				d = r10_bio->devs[j].devnum;
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
+			}
+		}
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+
+		if (!r10_bio->devs[i].repl_bio)
+			continue;
+
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
+		r10_bio->devs[i].repl_bio = mbio;
+
+		/* We are actively writing to the original device
+		 * so it cannot disappear, so the replacement cannot
+		 * become NULL here
+		 */
+		mbio->bi_sector = (r10_bio->devs[i].addr+
+				   conf->mirrors[d].replacement->data_offset);
+		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+		mbio->bi_end_io = raid10_end_write_request;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
+		mbio->bi_private = r10_bio;
+
+		atomic_inc(&r10_bio->remaining);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->mirrors + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+		if (tmp->replacement
+		    && tmp->replacement->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &tmp->replacement->flags)
+		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+			/* Replacement has just become active */
+			if (!tmp->rdev
+			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+				count++;
+			if (tmp->rdev) {
+				/* Replaced device not technically faulty,
+				 * but we need to be sure it gets removed
+				 * and never re-added.
+				 */
+				set_bit(Faulty, &tmp->rdev->flags);
+				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
+			}
+			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+		} else if (tmp->rdev
+			   && !test_bit(Faulty, &tmp->rdev->flags)
+			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
 			sysfs_notify_dirent(tmp->rdev->sysfs_state);
 		}
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		struct mirror_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
-		if (p->rdev)
-			continue;
+		if (p->rdev) {
+			if (!test_bit(WantReplacement, &p->rdev->flags) ||
+			    p->replacement != NULL)
+				continue;
+			clear_bit(In_sync, &rdev->flags);
+			set_bit(Replacement, &rdev->flags);
+			rdev->raid_disk = mirror;
+			err = 0;
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
+			conf->fullsync = 1;
+			rcu_assign_pointer(p->replacement, rdev);
+			break;
+		}
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static int raid10_remove_disk(struct mddev *mddev, int number)
+static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = 0;
-	struct md_rdev *rdev;
-	struct mirror_info *p = conf->mirrors+ number;
+	int number = rdev->raid_disk;
+	struct md_rdev **rdevp;
+	struct mirror_info *p = conf->mirrors + number;
 
 	print_conf(conf);
-	rdev = p->rdev;
-	if (rdev) {
-		if (test_bit(In_sync, &rdev->flags) ||
-		    atomic_read(&rdev->nr_pending)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		/* Only remove faulty devices in recovery
-		 * is not possible.
-		 */
-		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->recovery_disabled != p->recovery_disabled &&
-		    enough(conf, -1)) {
-			err = -EBUSY;
-			goto abort;
-		}
-		p->rdev = NULL;
-		synchronize_rcu();
-		if (atomic_read(&rdev->nr_pending)) {
-			/* lost the race, try later */
-			err = -EBUSY;
-			p->rdev = rdev;
-			goto abort;
-		}
-		err = md_integrity_register(mddev);
+	if (rdev == p->rdev)
+		rdevp = &p->rdev;
+	else if (rdev == p->replacement)
+		rdevp = &p->replacement;
+	else
+		return 0;
+
+	if (test_bit(In_sync, &rdev->flags) ||
+	    atomic_read(&rdev->nr_pending)) {
+		err = -EBUSY;
+		goto abort;
 	}
+	/* Only remove faulty devices if recovery
+	 * is not possible.
+	 */
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    mddev->recovery_disabled != p->recovery_disabled &&
+	    (!p->replacement || p->replacement == rdev) &&
+	    enough(conf, -1)) {
+		err = -EBUSY;
+		goto abort;
+	}
+	*rdevp = NULL;
+	synchronize_rcu();
+	if (atomic_read(&rdev->nr_pending)) {
+		/* lost the race, try later */
+		err = -EBUSY;
+		*rdevp = rdev;
+		goto abort;
+	} else if (p->replacement) {
+		/* We must have just cleared 'rdev' */
+		p->rdev = p->replacement;
+		clear_bit(Replacement, &p->replacement->flags);
+		smp_mb(); /* Make sure other CPUs may see both as identical
+			   * but will never see neither -- if they are careful.
+			   */
+		p->replacement = NULL;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else
+		/* We might have just remove the Replacement as faulty
+		 * Clear the flag just in case
+		 */
+		clear_bit(WantReplacement, &rdev->flags);
+
+	err = md_integrity_register(mddev);
+
 abort:
 	print_conf(conf);
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
-	d = find_bio_disk(conf, r10_bio, bio, NULL);
+	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
 	sector_t first_bad;
 	int bad_sectors;
 	int slot;
-
-	d = find_bio_disk(conf, r10_bio, bio, &slot);
+	int repl;
+	struct md_rdev *rdev = NULL;
+
+	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[d].replacement;
+	if (!rdev) {
+		smp_mb();
+		rdev = conf->mirrors[d].rdev;
+	}
 
 	if (!uptodate) {
-		set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
-		set_bit(R10BIO_WriteError, &r10_bio->state);
-	} else if (is_badblock(conf->mirrors[d].rdev,
+		if (repl)
+			md_error(mddev, rdev);
+		else {
+			set_bit(WriteErrorSeen, &rdev->flags);
+			if (!test_and_set_bit(WantReplacement, &rdev->flags))
+				set_bit(MD_RECOVERY_NEEDED,
+					&rdev->mddev->recovery);
+			set_bit(R10BIO_WriteError, &r10_bio->state);
+		}
+	} else if (is_badblock(rdev,
 			       r10_bio->devs[slot].addr,
 			       r10_bio->sectors,
 			       &first_bad, &bad_sectors))
 		set_bit(R10BIO_MadeGood, &r10_bio->state);
 
-	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	end_sync_request(r10_bio);
 }
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		generic_make_request(tbio);
 	}
 
+	/* Now write out to any replacement devices
+	 * that are active
+	 */
+	for (i = 0; i < conf->copies; i++) {
+		int j, d;
+		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
+
+		tbio = r10_bio->devs[i].repl_bio;
+		if (!tbio || !tbio->bi_end_io)
+			continue;
+		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+		    && r10_bio->devs[i].bio != fbio)
+			for (j = 0; j < vcnt; j++)
+				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+				       page_address(fbio->bi_io_vec[j].bv_page),
+				       PAGE_SIZE);
+		d = r10_bio->devs[i].devnum;
+		atomic_inc(&r10_bio->remaining);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     tbio->bi_size >> 9);
+		generic_make_request(tbio);
+	}
+
 done:
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
 		md_done_sync(mddev, r10_bio->sectors, 1);
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 					  s << 9,
 					  bio->bi_io_vec[idx].bv_page,
 					  WRITE, false);
-			if (!ok)
+			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
+				if (!test_and_set_bit(WantReplacement,
+						      &rdev->flags))
+					set_bit(MD_RECOVERY_NEEDED,
+						&rdev->mddev->recovery);
+			}
 		}
 		if (!ok) {
 			/* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int d;
-	struct bio *wbio;
+	struct bio *wbio, *wbio2;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	 * share the pages with the first bio
 	 * and submit the write request
 	 */
-	wbio = r10_bio->devs[1].bio;
 	d = r10_bio->devs[1].devnum;
-
-	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-	generic_make_request(wbio);
+	wbio = r10_bio->devs[1].bio;
+	wbio2 = r10_bio->devs[1].repl_bio;
+	if (wbio->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+		md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+		generic_make_request(wbio);
+	}
+	if (wbio2 && wbio2->bi_end_io) {
+		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+		md_sync_acct(conf->mirrors[d].replacement->bdev,
+			     wbio2->bi_size >> 9);
+		generic_make_request(wbio2);
+	}
 }
 
 
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
 		/* success */
 		return 1;
-	if (rw == WRITE)
+	if (rw == WRITE) {
 		set_bit(WriteErrorSeen, &rdev->flags);
+		if (!test_and_set_bit(WantReplacement, &rdev->flags))
+			set_bit(MD_RECOVERY_NEEDED,
+				&rdev->mddev->recovery);
+	}
 	/* need to record an error - either for the block or the device */
 	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 		md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int slot = r10_bio->read_slot;
-	int mirror = r10_bio->devs[slot].devnum;
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
 	int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	}
-	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+	rdev_dec_pending(rdev, mddev);
 
 	bio = r10_bio->devs[slot].bio;
 	bdevname(bio->bi_bdev, b);
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
 read_more:
-	mirror = read_balance(conf, r10_bio, &max_sectors);
-	if (mirror == -1) {
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	if (rdev == NULL) {
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
 	if (bio)
 		bio_put(bio);
 	slot = r10_bio->read_slot;
-	rdev = conf->mirrors[mirror].rdev;
 	printk_ratelimited(
 		KERN_ERR
 		"md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
 		    r10_bio->sector - bio->bi_sector,
 		    max_sectors);
 	r10_bio->devs[slot].bio = bio;
+	r10_bio->devs[slot].rdev = rdev;
 	bio->bi_sector = r10_bio->devs[slot].addr
 		+ rdev->data_offset;
 	bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					    r10_bio->sectors, 0))
 					md_error(conf->mddev, rdev);
 			}
+			rdev = conf->mirrors[dev].replacement;
+			if (r10_bio->devs[m].repl_bio == NULL)
+				continue;
+			if (test_bit(BIO_UPTODATE,
+				     &r10_bio->devs[m].repl_bio->bi_flags)) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+			} else {
+				if (!rdev_set_badblocks(
+					    rdev,
+					    r10_bio->devs[m].addr,
+					    r10_bio->sectors, 0))
+					md_error(conf->mddev, rdev);
+			}
 		}
 		put_buf(r10_bio);
 	} else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				}
 				rdev_dec_pending(rdev, conf->mddev);
 			}
+			bio = r10_bio->devs[m].repl_bio;
+			rdev = conf->mirrors[dev].replacement;
+			if (rdev && bio == IO_MADE_GOOD) {
+				rdev_clear_badblocks(
+					rdev,
+					r10_bio->devs[m].addr,
+					r10_bio->sectors);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 		if (test_bit(R10BIO_WriteError,
 			     &r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
 static int init_resync(struct r10conf *conf)
 {
 	int buffs;
+	int i;
 
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
 	BUG_ON(conf->r10buf_pool);
+	conf->have_replacement = 0;
+	for (i = 0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].replacement)
+			conf->have_replacement = 1;
 	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc,
					   r10buf_pool_free, conf);
 	if (!conf->r10buf_pool)
 		return -ENOMEM;
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bitmap_end_sync(mddev->bitmap, sect,
 						&sync_blocks, 1);
 			}
-	} else /* completed sync */
+	} else {
+		/* completed sync */
+		if ((!mddev->bitmap || conf->fullsync)
+		    && conf->have_replacement
+		    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			/* Completed a full sync so the replacements
+			 * are now fully recovered.
+			 */
+			for (i = 0; i < conf->raid_disks; i++)
+				if (conf->mirrors[i].replacement)
+					conf->mirrors[i].replacement
+						->recovery_offset
+						= MaxSector;
+		}
 		conf->fullsync = 0;
-
+	}
 	bitmap_close_sync(mddev->bitmap);
 	close_sync(conf);
 	*skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t sect;
 			int must_sync;
 			int any_working;
-
-			if (conf->mirrors[i].rdev == NULL ||
-			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+			struct mirror_info *mirror = &conf->mirrors[i];
+
+			if ((mirror->rdev == NULL ||
+			     test_bit(In_sync, &mirror->rdev->flags))
+			    &&
+			    (mirror->replacement == NULL ||
+			     test_bit(Faulty,
+				      &mirror->replacement->flags)))
 				continue;
 
 			still_degraded = 0;
 			/* want to reconstruct this device */
 			rb2 = r10_bio;
 			sect = raid10_find_virt(conf, sector_nr, i);
-			/* Unless we are doing a full sync, we only need
-			 * to recover the block if it is set in the bitmap
+			/* Unless we are doing a full sync, or a replacement
+			 * we only need to recover the block if it is set in
+			 * the bitmap
 			 */
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, 1);
 			if (sync_blocks < max_sync)
 				max_sync = sync_blocks;
 			if (!must_sync &&
+			    mirror->replacement == NULL &&
 			    !conf->fullsync) {
 				/* yep, skip the sync_blocks here, but don't assume
 				 * that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
 				from_addr = r10_bio->devs[j].addr;
-				bio->bi_sector = from_addr +
-					conf->mirrors[d].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-				atomic_inc(&r10_bio->remaining);
-				/* and we write to 'i' */
+				bio->bi_sector = from_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&rdev->nr_pending);
+				/* and we write to 'i' (if not in_sync) */
 
 				for (k=0; k<conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
 				BUG_ON(k == conf->copies);
-				bio = r10_bio->devs[1].bio;
-				bio->bi_next = biolist;
-				biolist = bio;
-				bio->bi_private = r10_bio;
-				bio->bi_end_io = end_sync_write;
-				bio->bi_rw = WRITE;
 				to_addr = r10_bio->devs[k].addr;
-				bio->bi_sector = to_addr +
-					conf->mirrors[i].rdev->data_offset;
-				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
 				r10_bio->devs[0].devnum = d;
 				r10_bio->devs[0].addr = from_addr;
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;
 
+				rdev = mirror->rdev;
+				if (!test_bit(In_sync, &rdev->flags)) {
+					bio = r10_bio->devs[1].bio;
+					bio->bi_next = biolist;
+					biolist = bio;
+					bio->bi_private = r10_bio;
+					bio->bi_end_io = end_sync_write;
+					bio->bi_rw = WRITE;
+					bio->bi_sector = to_addr +
+						rdev->data_offset;
+					bio->bi_bdev = rdev->bdev;
+					atomic_inc(&r10_bio->remaining);
+				} else
+					r10_bio->devs[1].bio->bi_end_io = NULL;
+
+				/* and maybe write to replacement */
+				bio = r10_bio->devs[1].repl_bio;
+				if (bio)
+					bio->bi_end_io = NULL;
+				rdev = mirror->replacement;
+				/* Note: if rdev != NULL, then bio
+				 * cannot be NULL as r10buf_pool_alloc will
+				 * have allocated it.
+				 * So the second test here is pointless.
+				 * But it keeps semantic-checkers happy, and
+				 * this comment keeps human reviewers
+				 * happy.
+				 */
+				if (rdev == NULL || bio == NULL ||
+				    test_bit(Faulty, &rdev->flags))
+					break;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = to_addr + rdev->data_offset;
+				bio->bi_bdev = rdev->bdev;
+				atomic_inc(&r10_bio->remaining);
 				break;
 			}
 			if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 					for (k = 0; k < conf->copies; k++)
 						if (r10_bio->devs[k].devnum == i)
 							break;
-					if (!rdev_set_badblocks(
-						    conf->mirrors[i].rdev,
+					if (!test_bit(In_sync,
+						      &mirror->rdev->flags)
+					    && !rdev_set_badblocks(
+						    mirror->rdev,
+						    r10_bio->devs[k].addr,
+						    max_sync, 0))
+						any_working = 0;
+					if (mirror->replacement &&
+					    !rdev_set_badblocks(
+						    mirror->replacement,
 						    r10_bio->devs[k].addr,
 						    max_sync, 0))
 						any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 					printk(KERN_INFO "md/raid10:%s: insufficient "
 					       "working devices for recovery.\n",
 					       mdname(mddev));
-					conf->mirrors[i].recovery_disabled
+					mirror->recovery_disabled
 						= mddev->recovery_disabled;
 				}
 				break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t first_bad, sector;
 			int bad_sectors;
 
+			if (r10_bio->devs[i].repl_bio)
+				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
 			bio = r10_bio->devs[i].bio;
 			bio->bi_end_io = NULL;
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				conf->mirrors[d].rdev->data_offset;
 			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 			count++;
+
+			if (conf->mirrors[d].replacement == NULL ||
+			    test_bit(Faulty,
+				     &conf->mirrors[d].replacement->flags))
+				continue;
+
+			/* Need to set up for writing to the replacement */
+			bio = r10_bio->devs[i].repl_bio;
+			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+			sector = r10_bio->devs[i].addr;
+			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+			bio->bi_next = biolist;
+			biolist = bio;
+			bio->bi_private = r10_bio;
+			bio->bi_end_io = end_sync_write;
+			bio->bi_rw = WRITE;
+			bio->bi_sector = sector +
+				conf->mirrors[d].replacement->data_offset;
+			bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+			count++;
 		}
 
 		if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (r10_bio->devs[i].bio->bi_end_io)
 					rdev_dec_pending(conf->mirrors[d].rdev,
 							 mddev);
+				if (r10_bio->devs[i].repl_bio &&
+				    r10_bio->devs[i].repl_bio->bi_end_io)
+					rdev_dec_pending(
+						conf->mirrors[d].replacement,
+						mddev);
 			}
 			put_buf(r10_bio);
 			biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
 			continue;
 		disk = conf->mirrors + disk_idx;
 
+		if (test_bit(Replacement, &rdev->flags)) {
+			if (disk->replacement)
+				goto out_free_conf;
+			disk->replacement = rdev;
+		} else {
+			if (disk->rdev)
+				goto out_free_conf;
+			disk->rdev = rdev;
+		}
+
 		disk->rdev = rdev;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
 
 		disk = conf->mirrors + i;
 
+		if (!disk->rdev && disk->replacement) {
+			/* The replacement is all we have - use it */
+			disk->rdev = disk->replacement;
+			disk->replacement = NULL;
+			clear_bit(Replacement, &disk->rdev->flags);
+		}
+
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;