summaryrefslogtreecommitdiff
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c244
1 files changed, 188 insertions, 56 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a9c7981ddd24..611b5f797618 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
*/
#define NR_RAID1_BIOS 256
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
/* When there are this many requests queue to be written by
* the raid1 thread, we become 'congested' to provide back-pressure
* for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
- int start_disk;
- int best_disk;
- int i;
+ int best_disk, best_dist_disk, best_pending_disk;
+ int has_nonrot_disk;
+ int disk;
sector_t best_dist;
+ unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
+ int choose_next_idle;
rcu_read_lock();
/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
retry:
sectors = r1_bio->sectors;
best_disk = -1;
+ best_dist_disk = -1;
best_dist = MaxSector;
+ best_pending_disk = -1;
+ min_pending = UINT_MAX;
best_good_sectors = 0;
+ has_nonrot_disk = 0;
+ choose_next_idle = 0;
if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync)) {
+ (this_sector + sectors >= conf->next_resync))
choose_first = 1;
- start_disk = 0;
- } else {
+ else
choose_first = 0;
- start_disk = conf->last_used;
- }
- for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
-
- int disk = start_disk + i;
- if (disk >= conf->raid_disks)
- disk -= conf->raid_disks;
+ unsigned int pending;
+ bool nonrot;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
} else
best_good_sectors = sectors;
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
- if (choose_first
- /* Don't change to another disk for sequential reads */
- || conf->next_seq_sect == this_sector
- || dist == 0
- /* If device is idle, use it */
- || atomic_read(&rdev->nr_pending) == 0) {
+ if (choose_first) {
best_disk = disk;
break;
}
+ /* Don't change to another disk for sequential reads */
+ if (conf->mirrors[disk].next_seq_sect == this_sector
+ || dist == 0) {
+ int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+ struct raid1_info *mirror = &conf->mirrors[disk];
+
+ best_disk = disk;
+ /*
+ * If buffered sequential IO size exceeds optimal
+ * iosize, check if there is idle disk. If yes, choose
+ * the idle disk. read_balance could already choose an
+ * idle disk before noticing it's a sequential IO in
+ * this disk. This doesn't matter because this disk
+ * will idle, next time it will be utilized after the
+ * first disk has IO size exceeds optimal iosize. In
+ * this way, iosize of the first disk will be optimal
+ * iosize at least. iosize of the second disk might be
+ * small, but not a big deal since when the second disk
+ * starts IO, the first disk is likely still busy.
+ */
+ if (nonrot && opt_iosize > 0 &&
+ mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >=
+ mirror->seq_start) {
+ choose_next_idle = 1;
+ continue;
+ }
+ break;
+ }
+ /* If device is idle, use it */
+ if (pending == 0) {
+ best_disk = disk;
+ break;
+ }
+
+ if (choose_next_idle)
+ continue;
+
+ if (min_pending > pending) {
+ min_pending = pending;
+ best_pending_disk = disk;
+ }
+
if (dist < best_dist) {
best_dist = dist;
- best_disk = disk;
+ best_dist_disk = disk;
}
}
+ /*
+ * If all disks are rotational, choose the closest disk. If any disk is
+ * non-rotational, choose the disk with less pending request even the
+ * disk is rotational, which might/might not be optimal for raids with
+ * mixed ratation/non-rotational disks depending on workload.
+ */
+ if (best_disk == -1) {
+ if (has_nonrot_disk)
+ best_disk = best_pending_disk;
+ else
+ best_disk = best_dist_disk;
+ }
+
if (best_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
goto retry;
}
sectors = best_good_sectors;
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = best_disk;
+
+ if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+ conf->mirrors[best_disk].seq_start = this_sector;
+
+ conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
rcu_read_unlock();
*max_sectors = sectors;
@@ -870,10 +944,48 @@ do_sync_io:
pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
}
+struct raid1_plug_cb {
+ struct blk_plug_cb cb;
+ struct bio_list pending;
+ int pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+ cb);
+ struct mddev *mddev = plug->cb.data;
+ struct r1conf *conf = mddev->private;
+ struct bio *bio;
+
+ if (from_schedule) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->pending_bio_list, &plug->pending);
+ conf->pending_count += plug->pending_cnt;
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(mddev->thread);
+ kfree(plug);
+ return;
+ }
+
+ /* we aren't scheduling, so we can do the write-out directly. */
+ bio = bio_list_get(&plug->pending);
+ bitmap_unplug(mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ kfree(plug);
+}
+
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r1conf *conf = mddev->private;
- struct mirror_info *mirror;
+ struct raid1_info *mirror;
struct r1bio *r1_bio;
struct bio *read_bio;
int i, disks;
@@ -883,7 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
struct md_rdev *blocked_rdev;
- int plugged;
+ struct blk_plug_cb *cb;
+ struct raid1_plug_cb *plug = NULL;
int first_clone;
int sectors_handled;
int max_sectors;
@@ -1034,7 +1147,6 @@ read_again:
* the bad blocks. Each set of writes gets it's own r1bio
* with a set of bios attached.
*/
- plugged = mddev_check_plugged(mddev);
disks = conf->raid_disks * 2;
retry_write:
@@ -1187,10 +1299,23 @@ read_again:
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
+
+ cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid1_plug_cb, cb);
+ else
+ plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
}
/* Mustn't call r1_bio_write_done before this next test,
* as it could result in the bio being freed.
@@ -1213,9 +1338,6 @@ read_again:
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
-
- if (do_sync || !bitmap || !plugged)
- md_wakeup_thread(mddev->thread);
}
static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1367,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = -EEXIST;
int mirror = 0;
- struct mirror_info *p;
+ struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1436,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct mirror_info *p = conf->mirrors+ number;
+ struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
@@ -1821,8 +1943,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
if (atomic_dec_and_test(&r1_bio->remaining)) {
/* if we're here, all write(s) have completed, so clean up */
- md_done_sync(mddev, r1_bio->sectors, 1);
- put_buf(r1_bio);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, 1);
+ }
}
}
@@ -2170,8 +2298,7 @@ static void raid1d(struct mddev *mddev)
blk_start_plug(&plug);
for (;;) {
- if (atomic_read(&mddev->plug_cnt) == 0)
- flush_pending_writes(conf);
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
@@ -2368,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio->bi_rw = READ;
bio->bi_end_io = end_sync_read;
read_targets++;
+ } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+ /*
+ * The device is suitable for reading (InSync),
+ * but has bad block(s) here. Let's try to correct them,
+ * if we are doing resync or repair. Otherwise, leave
+ * this device alone for this sync request.
+ */
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets++;
}
}
if (bio->bi_end_io) {
@@ -2425,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
- sector_t rv = max_sector - sector_nr;
+ sector_t rv;
+ if (min_bad > 0)
+ max_sector = sector_nr + min_bad;
+ rv = max_sector - sector_nr;
*skipped = 1;
put_buf(r1_bio);
return rv;
@@ -2488,9 +2630,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i = 0; i < conf->raid_disks * 2; i++) {
+ for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
+ read_targets--;
md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
@@ -2517,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
{
struct r1conf *conf;
int i;
- struct mirror_info *disk;
+ struct raid1_info *disk;
struct md_rdev *rdev;
int err = -ENOMEM;
@@ -2525,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct mirror_info)
+ conf->mirrors = kzalloc(sizeof(struct raid1_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
@@ -2568,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
mddev->merge_check_needed = 1;
disk->head_position = 0;
+ disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
@@ -2581,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
- conf->last_used = -1;
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
@@ -2607,21 +2750,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev &&
(disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1;
- } else if (conf->last_used < 0)
- /*
- * The first working device is used as a
- * starting point to read balancing.
- */
- conf->last_used = i;
+ }
}
- if (conf->last_used < 0) {
- printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
- mdname(mddev));
- goto abort;
- }
err = -ENOMEM;
- conf->thread = md_register_thread(raid1d, mddev, NULL);
+ conf->thread = md_register_thread(raid1d, mddev, "raid1");
if (!conf->thread) {
printk(KERN_ERR
"md/raid1:%s: couldn't allocate thread\n",
@@ -2794,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev)
*/
mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo;
- struct mirror_info *newmirrors;
+ struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
@@ -2837,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
- newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+ newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
@@ -2876,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
- conf->last_used = 0; /* just make sure it is in-range */
lower_barrier(conf);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);