summary | refs | log | tree | commit | diff
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- drivers/md/raid10.c 424
1 files changed, 203 insertions, 221 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e9462626ec5..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
} else {
/*
- * oops, read error:
+ * oops, read error - keep the refcount on the rdev
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
reschedule_retry(r10_bio);
}
-
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
const sector_t this_sector = r10_bio->sector;
- int disk, slot, nslot;
+ int disk, slot;
const int sectors = r10_bio->sectors;
- sector_t new_distance, current_distance;
+ sector_t new_distance, best_dist;
mdk_rdev_t *rdev;
+ int do_balance;
+ int best_slot;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
+retry:
+ best_slot = -1;
+ best_dist = MaxSector;
+ do_balance = 1;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync)) {
- /* make sure that disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
-
- while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot++;
- if (slot == conf->copies) {
- slot = 0;
- disk = -1;
- break;
- }
- disk = r10_bio->devs[slot].devnum;
- }
- goto rb_out;
- }
-
+ && (this_sector + sectors >= conf->next_resync))
+ do_balance = 0;
- /* make sure the disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
- while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot ++;
- if (slot == conf->copies) {
- disk = -1;
- goto rb_out;
- }
+ for (slot = 0; slot < conf->copies ; slot++) {
+ if (r10_bio->devs[slot].bio == IO_BLOCKED)
+ continue;
disk = r10_bio->devs[slot].devnum;
- }
-
-
- current_distance = abs(r10_bio->devs[slot].addr -
- conf->mirrors[disk].head_position);
-
- /* Find the disk whose head is closest,
- * or - for far > 1 - find the closest to partition beginning */
-
- for (nslot = slot; nslot < conf->copies; nslot++) {
- int ndisk = r10_bio->devs[nslot].devnum;
-
-
- if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
- r10_bio->devs[nslot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags))
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (rdev == NULL)
continue;
+ if (!test_bit(In_sync, &rdev->flags))
+ continue;
+
+ if (!do_balance)
+ break;
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
- disk = ndisk;
- slot = nslot;
+ if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
break;
- }
/* for far > 1 always use the lowest address */
if (conf->far_copies > 1)
- new_distance = r10_bio->devs[nslot].addr;
+ new_distance = r10_bio->devs[slot].addr;
else
- new_distance = abs(r10_bio->devs[nslot].addr -
- conf->mirrors[ndisk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- disk = ndisk;
- slot = nslot;
+ new_distance = abs(r10_bio->devs[slot].addr -
+ conf->mirrors[disk].head_position);
+ if (new_distance < best_dist) {
+ best_dist = new_distance;
+ best_slot = slot;
}
}
+ if (slot == conf->copies)
+ slot = best_slot;
-rb_out:
- r10_bio->read_slot = slot;
-/* conf->next_seq_sect = this_sector + sectors;*/
-
- if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
- atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
- else
+ if (slot >= 0) {
+ disk = r10_bio->devs[slot].devnum;
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* Cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ r10_bio->read_slot = slot;
+ } else
disk = -1;
rcu_read_unlock();
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev) { /* If rdev is not NULL */
- char b[BDEVNAME_SIZE];
- int cur_read_error_count = 0;
+ /* still own a reference to this rdev, so it cannot
+ * have been cleared recently.
+ */
+ rdev = conf->mirrors[d].rdev;
- bdevname(rdev->bdev, b);
+ if (test_bit(Faulty, &rdev->flags))
+ /* drive has already been failed, just ignore any
+ more fix_read_error() attempts */
+ return;
- if (test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
- /* drive has already been failed, just ignore any
- more fix_read_error() attempts */
- return;
- }
+ check_decay_read_errors(mddev, rdev);
+ atomic_inc(&rdev->read_errors);
+ if (atomic_read(&rdev->read_errors) > max_read_errors) {
+ char b[BDEVNAME_SIZE];
+ bdevname(rdev->bdev, b);
- check_decay_read_errors(mddev, rdev);
- atomic_inc(&rdev->read_errors);
- cur_read_error_count = atomic_read(&rdev->read_errors);
- if (cur_read_error_count > max_read_errors) {
- rcu_read_unlock();
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Raid device exceeded "
- "read_error threshold "
- "[cur %d:max %d]\n",
- mdname(mddev),
- b, cur_read_error_count, max_read_errors);
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Failing raid "
- "device\n", mdname(mddev), b);
- md_error(mddev, conf->mirrors[d].rdev);
- return;
- }
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Raid device exceeded "
+ "read_error threshold [cur %d:max %d]\n",
+ mdname(mddev), b,
+ atomic_read(&rdev->read_errors), max_read_errors);
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Failing raid device\n",
+ mdname(mddev), b);
+ md_error(mddev, conf->mirrors[d].rdev);
+ return;
}
- rcu_read_unlock();
while(sectors) {
int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"write failed"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"corrected sectors"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"md/raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
}
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
else {
- int mirror;
+ int slot = r10_bio->read_slot;
+ int mirror = r10_bio->devs[slot].devnum;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
* We freeze all other IO, and try reading the block from
@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
+ rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
- bio = r10_bio->devs[r10_bio->read_slot].bio;
- r10_bio->devs[r10_bio->read_slot].bio =
+ bio = r10_bio->devs[slot].bio;
+ r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
mirror = read_balance(conf, r10_bio);
if (mirror == -1) {
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
} else {
const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
bio_put(bio);
+ slot = r10_bio->read_slot;
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
(unsigned long long)r10_bio->sector);
bio = bio_clone_mddev(r10_bio->master_bio,
GFP_NOIO, mddev);
- r10_bio->devs[r10_bio->read_slot].bio = bio;
- bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+ r10_bio->devs[slot].bio = bio;
+ bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync;
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
*
*/
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+ int *skipped, int go_faster)
{
conf_t *conf = mddev->private;
r10bio_t *r10_bio;
struct bio *biolist = NULL, *bio;
sector_t max_sector, nr_sectors;
- int disk;
int i;
int max_sync;
sector_t sync_blocks;
@@ -1858,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int j, k;
r10_bio = NULL;
- for (i=0 ; i<conf->raid_disks; i++)
- if (conf->mirrors[i].rdev &&
- !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
- int still_degraded = 0;
- /* want to reconstruct this device */
- r10bio_t *rb2 = r10_bio;
- sector_t sect = raid10_find_virt(conf, sector_nr, i);
- int must_sync;
- /* Unless we are doing a full sync, we only need
- * to recover the block if it is set in the bitmap
- */
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
- if (sync_blocks < max_sync)
- max_sync = sync_blocks;
- if (!must_sync &&
- !conf->fullsync) {
- /* yep, skip the sync_blocks here, but don't assume
- * that there will never be anything to do here
- */
- chunks_skipped = -1;
- continue;
- }
+ for (i=0 ; i<conf->raid_disks; i++) {
+ int still_degraded;
+ r10bio_t *rb2;
+ sector_t sect;
+ int must_sync;
- r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
- raise_barrier(conf, rb2 != NULL);
- atomic_set(&r10_bio->remaining, 0);
+ if (conf->mirrors[i].rdev == NULL ||
+ test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+ continue;
- r10_bio->master_bio = (struct bio*)rb2;
- if (rb2)
- atomic_inc(&rb2->remaining);
- r10_bio->mddev = mddev;
- set_bit(R10BIO_IsRecover, &r10_bio->state);
- r10_bio->sector = sect;
+ still_degraded = 0;
+ /* want to reconstruct this device */
+ rb2 = r10_bio;
+ sect = raid10_find_virt(conf, sector_nr, i);
+ /* Unless we are doing a full sync, we only need
+ * to recover the block if it is set in the bitmap
+ */
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ if (!must_sync &&
+ !conf->fullsync) {
+ /* yep, skip the sync_blocks here, but don't assume
+ * that there will never be anything to do here
+ */
+ chunks_skipped = -1;
+ continue;
+ }
- raid10_find_phys(conf, r10_bio);
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ raise_barrier(conf, rb2 != NULL);
+ atomic_set(&r10_bio->remaining, 0);
- /* Need to check if the array will still be
- * degraded
- */
- for (j=0; j<conf->raid_disks; j++)
- if (conf->mirrors[j].rdev == NULL ||
- test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
- still_degraded = 1;
- break;
- }
-
- must_sync = bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, still_degraded);
-
- for (j=0; j<conf->copies;j++) {
- int d = r10_bio->devs[j].devnum;
- if (conf->mirrors[d].rdev &&
- test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
- /* This is where we read from */
- bio = r10_bio->devs[0].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_read;
- bio->bi_rw = READ;
- bio->bi_sector = r10_bio->devs[j].addr +
- conf->mirrors[d].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[d].rdev->bdev;
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- atomic_inc(&r10_bio->remaining);
- /* and we write to 'i' */
-
- for (k=0; k<conf->copies; k++)
- if (r10_bio->devs[k].devnum == i)
- break;
- BUG_ON(k == conf->copies);
- bio = r10_bio->devs[1].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_write;
- bio->bi_rw = WRITE;
- bio->bi_sector = r10_bio->devs[k].addr +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
- r10_bio->devs[0].devnum = d;
- r10_bio->devs[1].devnum = i;
+ r10_bio->master_bio = (struct bio*)rb2;
+ if (rb2)
+ atomic_inc(&rb2->remaining);
+ r10_bio->mddev = mddev;
+ set_bit(R10BIO_IsRecover, &r10_bio->state);
+ r10_bio->sector = sect;
- break;
- }
- }
- if (j == conf->copies) {
- /* Cannot recover, so abort the recovery */
- put_buf(r10_bio);
- if (rb2)
- atomic_dec(&rb2->remaining);
- r10_bio = rb2;
- if (!test_and_set_bit(MD_RECOVERY_INTR,
- &mddev->recovery))
- printk(KERN_INFO "md/raid10:%s: insufficient "
- "working devices for recovery.\n",
- mdname(mddev));
+ raid10_find_phys(conf, r10_bio);
+
+ /* Need to check if the array will still be
+ * degraded
+ */
+ for (j=0; j<conf->raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+ still_degraded = 1;
break;
}
+
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, still_degraded);
+
+ for (j=0; j<conf->copies;j++) {
+ int d = r10_bio->devs[j].devnum;
+ if (!conf->mirrors[d].rdev ||
+ !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+ continue;
+ /* This is where we read from */
+ bio = r10_bio->devs[0].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_read;
+ bio->bi_rw = READ;
+ bio->bi_sector = r10_bio->devs[j].addr +
+ conf->mirrors[d].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ atomic_inc(&r10_bio->remaining);
+ /* and we write to 'i' */
+
+ for (k=0; k<conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ BUG_ON(k == conf->copies);
+ bio = r10_bio->devs[1].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = r10_bio->devs[k].addr +
+ conf->mirrors[i].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+ r10_bio->devs[0].devnum = d;
+ r10_bio->devs[1].devnum = i;
+
+ break;
+ }
+ if (j == conf->copies) {
+ /* Cannot recover, so abort the recovery */
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
+ printk(KERN_INFO "md/raid10:%s: insufficient "
+ "working devices for recovery.\n",
+ mdname(mddev));
+ break;
}
+ }
if (biolist == NULL) {
while (r10_bio) {
r10bio_t *rb2 = r10_bio;
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
- !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+ &mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
if (r10_bio->devs[i].bio->bi_end_io)
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ rdev_dec_pending(conf->mirrors[d].rdev,
+ mddev);
}
put_buf(r10_bio);
biolist = NULL;
@@ -2047,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
do {
struct page *page;
int len = PAGE_SIZE;
- disk = 0;
if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
+ struct bio *bio2;
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- struct bio *bio2;
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_size -= len;
- bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
- }
- goto bio_full;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ for (bio2 = biolist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_size -= len;
+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
}
- disk = i;
+ goto bio_full;
}
nr_sectors += len>>9;
sector_nr += len>>9;