summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c21
-rw-r--r--drivers/md/dm-flakey.c11
-rw-r--r--drivers/md/dm-linear.c12
-rw-r--r--drivers/md/dm-mpath.c6
-rw-r--r--drivers/md/dm-raid.c12
-rw-r--r--drivers/md/dm-table.c6
-rw-r--r--drivers/md/dm.c1
-rw-r--r--drivers/md/linear.c1
-rw-r--r--drivers/md/md.c150
-rw-r--r--drivers/md/md.h82
-rw-r--r--drivers/md/multipath.c7
-rw-r--r--drivers/md/raid1.c185
-rw-r--r--drivers/md/raid1.h7
-rw-r--r--drivers/md/raid10.c582
-rw-r--r--drivers/md/raid10.h61
-rw-r--r--drivers/md/raid5.c577
-rw-r--r--drivers/md/raid5.h98
17 files changed, 1359 insertions, 460 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7878712721bf..cdf36b1e9aa6 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1106,10 +1106,12 @@ void bitmap_write_all(struct bitmap *bitmap)
*/
int i;
+ spin_lock_irq(&bitmap->lock);
for (i = 0; i < bitmap->file_pages; i++)
set_page_attr(bitmap, bitmap->filemap[i],
BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0;
+ spin_unlock_irq(&bitmap->lock);
}
static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
@@ -1147,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
return;
}
if (time_before(jiffies, bitmap->daemon_lastrun
- + bitmap->mddev->bitmap_info.daemon_sleep))
+ + mddev->bitmap_info.daemon_sleep))
goto done;
bitmap->daemon_lastrun = jiffies;
if (bitmap->allclean) {
- bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
goto done;
}
bitmap->allclean = 1;
@@ -1204,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
* sure that events_cleared is up-to-date.
*/
if (bitmap->need_sync &&
- bitmap->mddev->bitmap_info.external == 0) {
+ mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1268,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)
done:
if (bitmap->allclean == 0)
- bitmap->mddev->thread->timeout =
- bitmap->mddev->bitmap_info.daemon_sleep;
+ mddev->thread->timeout =
+ mddev->bitmap_info.daemon_sleep;
mutex_unlock(&mddev->bitmap_info.mutex);
}
@@ -1391,9 +1393,6 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
atomic_read(&bitmap->behind_writes),
bitmap->mddev->bitmap_info.max_write_behind);
}
- if (bitmap->mddev->degraded)
- /* Never clear bits or update events_cleared when degraded */
- success = 0;
while (sectors) {
sector_t blocks;
@@ -1407,7 +1406,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
return;
}
- if (success &&
+ if (success && !bitmap->mddev->degraded &&
bitmap->events_cleared < bitmap->mddev->events) {
bitmap->events_cleared = bitmap->mddev->events;
bitmap->need_sync = 1;
@@ -1588,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
}
if (!*bmc) {
struct page *page;
- *bmc = 1 | (needed ? NEEDED_MASK : 0);
+ *bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
@@ -1605,7 +1604,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
for (chunk = s; chunk <= e; chunk++) {
sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
bitmap_set_memory_bits(bitmap, sec, 1);
+ spin_lock_irq(&bitmap->lock);
bitmap_file_set_bit(bitmap, sec);
+ spin_unlock_irq(&bitmap->lock);
if (sec < bitmap->mddev->recovery_cp)
/* We are asserting that the array is dirty,
* so move the recovery_cp address back so
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index f84c08029b21..9fb18c147825 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -368,8 +368,17 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
{
struct flakey_c *fc = ti->private;
+ struct dm_dev *dev = fc->dev;
+ int r = 0;
- return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (fc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 3921e3bb43c1..9728839f844a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -116,7 +116,17 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct linear_c *lc = (struct linear_c *) ti->private;
- return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
+ struct dm_dev *dev = lc->dev;
+ int r = 0;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (lc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5e0090ef4182..801d92d237cf 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1520,6 +1520,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
spin_unlock_irqrestore(&m->lock, flags);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c2907d836e4e..86cb7e5d83d5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -56,7 +56,8 @@ struct raid_dev {
struct raid_set {
struct dm_target *ti;
- uint64_t print_flags;
+ uint32_t bitmap_loaded;
+ uint32_t print_flags;
struct mddev md;
struct raid_type *raid_type;
@@ -1085,7 +1086,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += 2;
}
- raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
+ raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
raid_param_cnt--;
@@ -1197,7 +1198,12 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- bitmap_load(&rs->md);
+ if (!rs->bitmap_loaded) {
+ bitmap_load(&rs->md);
+ rs->bitmap_loaded = 1;
+ } else
+ md_wakeup_thread(rs->md.thread);
+
mddev_resume(&rs->md);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8e9132130142..63cc54289aff 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -699,7 +699,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
while (i < dm_table_get_num_targets(table)) {
ti = dm_table_get_target(table, i++);
- blk_set_default_limits(&ti_limits);
+ blk_set_stacking_limits(&ti_limits);
/* combine all target devices' limits */
if (ti->type->iterate_devices)
@@ -1221,10 +1221,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits ti_limits;
unsigned i = 0;
- blk_set_default_limits(limits);
+ blk_set_stacking_limits(limits);
while (i < dm_table_get_num_targets(table)) {
- blk_set_default_limits(&ti_limits);
+ blk_set_stacking_limits(&ti_limits);
ti = dm_table_get_target(table, i++);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4720f68f817e..b89c548ec3f8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -14,7 +14,6 @@
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index c3273efd08cb..627456542fb3 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -230,6 +230,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
return -EINVAL;
rdev->raid_disk = rdev->saved_raid_disk;
+ rdev->saved_raid_disk = -1;
newconf = linear_conf(mddev,mddev->raid_disks+1);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84acfe7d10e4..ce88755baf4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,8 +36,7 @@
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
@@ -570,7 +569,7 @@ static void mddev_put(struct mddev *mddev)
mddev->ctime == 0 && !mddev->hold_active) {
/* Array is not configured at all, and not held active,
* so destroy it */
- list_del(&mddev->all_mddevs);
+ list_del_init(&mddev->all_mddevs);
bs = mddev->bio_set;
mddev->bio_set = NULL;
if (mddev->gendisk) {
@@ -1714,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+ set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
@@ -1767,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset);
}
+ if (test_bit(Replacement, &rdev->flags))
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_REPLACEMENT);
if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2546,7 +2550,8 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (test_bit(Blocked, &rdev->flags) ||
- rdev->badblocks.unacked_exist) {
+ (rdev->badblocks.unacked_exist
+ && !test_bit(Faulty, &rdev->flags))) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
+ if (test_bit(WantReplacement, &rdev->flags)) {
+ len += sprintf(page+len, "%swant_replacement", sep);
+ sep = ",";
+ }
+ if (test_bit(Replacement, &rdev->flags)) {
+ len += sprintf(page+len, "%sreplacement", sep);
+ sep = ",";
+ }
+
return len+sprintf(page+len, "\n");
}
@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "want_replacement")) {
+ /* Any non-spare device that is not a replacement can
+ * become want_replacement at any time, but we then need to
+ * check if recovery is needed.
+ */
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Replacement, &rdev->flags))
+ set_bit(WantReplacement, &rdev->flags);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+ err = 0;
+ } else if (cmd_match(buf, "-want_replacement")) {
+ /* Clearing 'want_replacement' is always allowed.
+ * Once replacements starts it is too late though.
+ */
+ err = 0;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else if (cmd_match(buf, "replacement")) {
+ /* Can only set a device as a replacement when array has not
+ * yet been started. Once running, replacement is automatic
+ * from spares, or by assigning 'slot'.
+ */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ set_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "-replacement")) {
+ /* Similarly, can only clear Replacement before start */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ clear_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL;
err = rdev->mddev->pers->
- hot_remove_disk(rdev->mddev, rdev->raid_disk);
+ hot_remove_disk(rdev->mddev, rdev);
if (err)
return err;
sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
- struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
- list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
- if (rdev2->raid_disk == slot)
- return -EEXIST;
-
if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
@@ -3788,6 +3833,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
else {
+ if (mddev->hold_active == UNTIL_IOCTL)
+ mddev->hold_active = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
return len;
}
@@ -4487,11 +4534,20 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
+ spin_lock(&all_mddevs_lock);
+ if (list_empty(&mddev->all_mddevs)) {
+ spin_unlock(&all_mddevs_lock);
+ return -EBUSY;
+ }
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+
rv = mddev_lock(mddev);
if (!rv) {
rv = entry->show(mddev, page);
mddev_unlock(mddev);
}
+ mddev_put(mddev);
return rv;
}
@@ -4507,13 +4563,19 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+ spin_lock(&all_mddevs_lock);
+ if (list_empty(&mddev->all_mddevs)) {
+ spin_unlock(&all_mddevs_lock);
+ return -EBUSY;
+ }
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
rv = mddev_lock(mddev);
- if (mddev->hold_active == UNTIL_IOCTL)
- mddev->hold_active = 0;
if (!rv) {
rv = entry->store(mddev, page, length);
mddev_unlock(mddev);
}
+ mddev_put(mddev);
return rv;
}
@@ -4604,6 +4666,7 @@ static int md_alloc(dev_t dev, char *name)
mddev->queue->queuedata = mddev;
blk_queue_make_request(mddev->queue, md_make_request);
+ blk_set_stacking_limits(&mddev->queue->limits);
disk = alloc_disk(1 << shift);
if (!disk) {
@@ -6036,8 +6099,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
struct mddev *mddev = NULL;
int ro;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ switch (cmd) {
+ case RAID_VERSION:
+ case GET_ARRAY_INFO:
+ case GET_DISK_INFO:
+ break;
+ default:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
/*
* Commands dealing with the RAID driver but not any
@@ -6697,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
- } else if (rdev->raid_disk < 0)
+ }
+ if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */
+ if (test_bit(Replacement, &rdev->flags))
+ seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
@@ -7260,7 +7333,8 @@ void md_do_sync(struct mddev *mddev)
printk(KERN_INFO
"md: checkpointing %s of %s.\n",
desc, mdname(mddev));
- mddev->recovery_cp = mddev->curr_resync;
+ mddev->recovery_cp =
+ mddev->curr_resync_completed;
}
} else
mddev->recovery_cp = MaxSector;
@@ -7278,9 +7352,9 @@ void md_do_sync(struct mddev *mddev)
rcu_read_unlock();
}
}
+ skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- skip:
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7310,6 +7384,7 @@ static int remove_and_add_spares(struct mddev *mddev)
{
struct md_rdev *rdev;
int spares = 0;
+ int removed = 0;
mddev->curr_resync_completed = 0;
@@ -7320,30 +7395,32 @@ static int remove_and_add_spares(struct mddev *mddev)
! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
- mddev, rdev->raid_disk)==0) {
+ mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
+ removed++;
}
}
+ if (removed)
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
- if (mddev->degraded) {
- list_for_each_entry(rdev, &mddev->disks, same_set) {
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ spares++;
+ if (rdev->raid_disk < 0
+ && !test_bit(Faulty, &rdev->flags)) {
+ rdev->recovery_offset = 0;
+ if (mddev->pers->
+ hot_add_disk(mddev, rdev) == 0) {
+ if (sysfs_link_rdev(mddev, rdev))
+ /* failure here is OK */;
spares++;
- if (rdev->raid_disk < 0
- && !test_bit(Faulty, &rdev->flags)) {
- rdev->recovery_offset = 0;
- if (mddev->pers->
- hot_add_disk(mddev, rdev) == 0) {
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
- spares++;
- md_new_event(mddev);
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- } else
- break;
+ md_new_event(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
}
@@ -7458,7 +7535,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
- mddev, rdev->raid_disk)==0) {
+ mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
@@ -7840,6 +7917,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s + rdev->data_offset, sectors, acknowledged);
if (rv) {
/* Make sure they get written out promptly */
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index cf742d9306ec..44c63dfeeb2b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
* This reduces the burden of testing multiple flags in many cases
*/
- unsigned long flags;
-#define Faulty 1 /* device is known to have a fault */
-#define In_sync 2 /* device is in_sync with rest of array */
-#define WriteMostly 4 /* Avoid reading if at all possible */
-#define AutoDetected 7 /* added by auto-detect */
-#define Blocked 8 /* An error occurred but has not yet
- * been acknowledged by the metadata
- * handler, so don't allow writes
- * until it is cleared */
-#define WriteErrorSeen 9 /* A write error has been seen on this
- * device
- */
-#define FaultRecorded 10 /* Intermediate state for clearing
- * Blocked. The Fault is/will-be
- * recorded in the metadata, but that
- * metadata hasn't been stored safely
- * on disk yet.
- */
-#define BlockedBadBlocks 11 /* A writer is blocked because they
- * found an unacknowledged bad-block.
- * This can safely be cleared at any
- * time, and the writer will re-check.
- * It may be set at any time, and at
- * worst the writer will timeout and
- * re-check. So setting it as
- * accurately as possible is good, but
- * not absolutely critical.
- */
+ unsigned long flags; /* bit set of 'enum flag_bits' bits. */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
sector_t size; /* in sectors */
} badblocks;
};
+enum flag_bits {
+ Faulty, /* device is known to have a fault */
+ In_sync, /* device is in_sync with rest of array */
+ WriteMostly, /* Avoid reading if at all possible */
+ AutoDetected, /* added by auto-detect */
+ Blocked, /* An error occurred but has not yet
+ * been acknowledged by the metadata
+ * handler, so don't allow writes
+ * until it is cleared */
+ WriteErrorSeen, /* A write error has been seen on this
+ * device
+ */
+ FaultRecorded, /* Intermediate state for clearing
+ * Blocked. The Fault is/will-be
+ * recorded in the metadata, but that
+ * metadata hasn't been stored safely
+ * on disk yet.
+ */
+ BlockedBadBlocks, /* A writer is blocked because they
+ * found an unacknowledged bad-block.
+ * This can safely be cleared at any
+ * time, and the writer will re-check.
+ * It may be set at any time, and at
+ * worst the writer will timeout and
+ * re-check. So setting it as
+ * accurately as possible is good, but
+ * not absolutely critical.
+ */
+ WantReplacement, /* This device is a candidate to be
+ * hot-replaced, either because it has
+ * reported some faults, or because
+ * of explicit request.
+ */
+ Replacement, /* This device is a replacement for
+ * a want_replacement device with same
+ * raid_disk number.
+ */
+};
#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -428,7 +439,7 @@ struct md_personality
*/
void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
- int (*hot_remove_disk) (struct mddev *mddev, int number);
+ int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev);
sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (struct mddev *mddev, sector_t sectors);
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ if (!test_bit(Replacement, &rdev->flags)) {
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ } else
+ return 0;
}
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ if (!test_bit(Replacement, &rdev->flags)) {
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ }
}
/*
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5899246fa37e..a222f516660e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err;
}
-static int multipath_remove_disk(struct mddev *mddev, int number)
+static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
int err = 0;
- struct md_rdev *rdev;
+ int number = rdev->raid_disk;
struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf);
- rdev = p->rdev;
- if (rdev) {
+ if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
printk(KERN_ERR "hot-remove-disk, slot %d is identified"
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ede2461e79c5..a368db2431a5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -135,7 +135,7 @@ out_free_pages:
put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
- while ( ++j < pi->raid_disks )
+ while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
r1bio_pool_free(r1_bio, data);
return NULL;
@@ -164,7 +164,7 @@ static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
{
int i;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio **bio = r1_bio->bios + i;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
@@ -185,7 +185,7 @@ static void put_buf(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
int i;
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct bio *bio = r1_bio->bios[i];
if (bio->bi_end_io)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
@@ -277,13 +277,14 @@ static inline void update_head_pos(int disk, struct r1bio *r1_bio)
static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
{
int mirror;
- int raid_disks = r1_bio->mddev->raid_disks;
+ struct r1conf *conf = r1_bio->mddev->private;
+ int raid_disks = conf->raid_disks;
- for (mirror = 0; mirror < raid_disks; mirror++)
+ for (mirror = 0; mirror < raid_disks * 2; mirror++)
if (r1_bio->bios[mirror] == bio)
break;
- BUG_ON(mirror == raid_disks);
+ BUG_ON(mirror == raid_disks * 2);
update_head_pos(mirror, r1_bio);
return mirror;
@@ -390,6 +391,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (!uptodate) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ conf->mddev->recovery);
+
set_bit(R1BIO_WriteError, &r1_bio->state);
} else {
/*
@@ -505,7 +511,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
start_disk = conf->last_used;
}
- for (i = 0 ; i < conf->raid_disks ; i++) {
+ for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
@@ -525,8 +531,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0)
+ if (best_disk < 0) {
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad < this_sector)
+ /* Cannot use this */
+ continue;
+ best_good_sectors = first_bad - this_sector;
+ } else
+ best_good_sectors = sectors;
best_disk = disk;
+ }
continue;
}
/* This is a reasonable device to use. It might
@@ -609,7 +624,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1;
rcu_read_lock();
- for (i = 0; i < mddev->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -974,7 +989,7 @@ read_again:
*/
plugged = mddev_check_plugged(mddev);
- disks = conf->raid_disks;
+ disks = conf->raid_disks * 2;
retry_write:
blocked_rdev = NULL;
rcu_read_lock();
@@ -988,7 +1003,8 @@ read_again:
}
r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (i < conf->raid_disks)
+ set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
}
@@ -1263,6 +1279,25 @@ static int raid1_spare_active(struct mddev *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
+ struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+ if (repl
+ && repl->recovery_offset == MaxSector
+ && !test_bit(Faulty, &repl->flags)
+ && !test_and_set_bit(In_sync, &repl->flags)) {
+ /* replacement has just become active */
+ if (!rdev ||
+ !test_and_clear_bit(In_sync, &rdev->flags))
+ count++;
+ if (rdev) {
+ /* Replaced device not technically
+ * faulty, but we need to be sure
+ * it gets removed and never re-added
+ */
+ set_bit(Faulty, &rdev->flags);
+ sysfs_notify_dirent_safe(
+ rdev->sysfs_state);
+ }
+ }
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1286,7 +1321,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror = 0;
struct mirror_info *p;
int first = 0;
- int last = mddev->raid_disks - 1;
+ int last = conf->raid_disks - 1;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1294,8 +1329,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- for (mirror = first; mirror <= last; mirror++)
- if ( !(p=conf->mirrors+mirror)->rdev) {
+ for (mirror = first; mirror <= last; mirror++) {
+ p = conf->mirrors+mirror;
+ if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1322,21 +1358,35 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
+ if (test_bit(WantReplacement, &p->rdev->flags) &&
+ p[conf->raid_disks].rdev == NULL) {
+ /* Add this device as a replacement */
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ break;
+ }
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
-static int raid1_remove_disk(struct mddev *mddev, int number)
+static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
int err = 0;
- struct md_rdev *rdev;
+ int number = rdev->raid_disk;
struct mirror_info *p = conf->mirrors+ number;
+ if (rdev != p->rdev)
+ p = conf->mirrors + conf->raid_disks + number;
+
print_conf(conf);
- rdev = p->rdev;
- if (rdev) {
+ if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
@@ -1358,7 +1408,21 @@ static int raid1_remove_disk(struct mddev *mddev, int number)
err = -EBUSY;
p->rdev = rdev;
goto abort;
- }
+ } else if (conf->mirrors[conf->raid_disks + number].rdev) {
+ /* We just removed a device that is being replaced.
+ * Move down the replacement. We drain all IO before
+ * doing this to avoid confusion.
+ */
+ struct md_rdev *repl =
+ conf->mirrors[conf->raid_disks + number].rdev;
+ raise_barrier(conf);
+ clear_bit(Replacement, &repl->flags);
+ p->rdev = repl;
+ conf->mirrors[conf->raid_disks + number].rdev = NULL;
+ lower_barrier(conf);
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
}
abort:
@@ -1411,6 +1475,10 @@ static void end_sync_write(struct bio *bio, int error)
} while (sectors_to_go > 0);
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(conf->mirrors[mirror].rdev,
r1_bio->sector,
@@ -1441,8 +1509,13 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
- if (rw == WRITE)
+ if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ rdev->mddev->recovery);
+ }
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
@@ -1493,7 +1566,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
}
}
d++;
- if (d == conf->raid_disks)
+ if (d == conf->raid_disks * 2)
d = 0;
} while (!success && d != r1_bio->read_disk);
@@ -1510,7 +1583,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
- for (d = 0; d < conf->raid_disks; d++) {
+ for (d = 0; d < conf->raid_disks * 2; d++) {
rdev = conf->mirrors[d].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
@@ -1536,7 +1609,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
/* write it back and re-read */
while (d != r1_bio->read_disk) {
if (d == 0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
@@ -1551,7 +1624,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
d = start;
while (d != r1_bio->read_disk) {
if (d == 0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
@@ -1584,7 +1657,7 @@ static int process_checks(struct r1bio *r1_bio)
int primary;
int i;
- for (primary = 0; primary < conf->raid_disks; primary++)
+ for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL;
@@ -1592,7 +1665,7 @@ static int process_checks(struct r1bio *r1_bio)
break;
}
r1_bio->read_disk = primary;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
@@ -1656,7 +1729,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
int i;
- int disks = conf->raid_disks;
+ int disks = conf->raid_disks * 2;
struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk];
@@ -1737,7 +1810,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
success = 1;
else {
d++;
- if (d == conf->raid_disks)
+ if (d == conf->raid_disks * 2)
d = 0;
}
} while (!success && d != read_disk);
@@ -1753,7 +1826,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
start = d;
while (d != read_disk) {
if (d==0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
@@ -1765,7 +1838,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
while (d != read_disk) {
char b[BDEVNAME_SIZE];
if (d==0)
- d = conf->raid_disks;
+ d = conf->raid_disks * 2;
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
@@ -1887,7 +1960,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
{
int m;
int s = r1_bio->sectors;
- for (m = 0; m < conf->raid_disks ; m++) {
+ for (m = 0; m < conf->raid_disks * 2 ; m++) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
@@ -1909,7 +1982,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
- for (m = 0; m < conf->raid_disks ; m++)
+ for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev,
@@ -2184,7 +2257,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state);
- for (i=0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
@@ -2203,7 +2276,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
- still_degraded = 1;
+ if (i < conf->raid_disks)
+ still_degraded = 1;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_rw = WRITE;
bio->bi_end_io = end_sync_write;
@@ -2254,7 +2328,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
* need to mark them bad on all write targets
*/
int ok = 1;
- for (i = 0 ; i < conf->raid_disks ; i++)
+ for (i = 0 ; i < conf->raid_disks * 2 ; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
struct md_rdev *rdev =
rcu_dereference(conf->mirrors[i].rdev);
@@ -2323,7 +2397,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
len = sync_blocks<<9;
}
- for (i=0 ; i < conf->raid_disks; i++) {
+ for (i = 0 ; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io) {
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
@@ -2356,7 +2430,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
@@ -2393,7 +2467,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+ conf->mirrors = kzalloc(sizeof(struct mirror_info)
+ * mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
goto abort;
@@ -2405,7 +2480,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
if (!conf->poolinfo)
goto abort;
- conf->poolinfo->raid_disks = mddev->raid_disks;
+ conf->poolinfo->raid_disks = mddev->raid_disks * 2;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free,
conf->poolinfo);
@@ -2414,14 +2489,20 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->poolinfo->mddev = mddev;
+ err = -EINVAL;
spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
- disk = conf->mirrors + disk_idx;
+ if (test_bit(Replacement, &rdev->flags))
+ disk = conf->mirrors + conf->raid_disks + disk_idx;
+ else
+ disk = conf->mirrors + disk_idx;
+ if (disk->rdev)
+ goto abort;
disk->rdev = rdev;
disk->head_position = 0;
@@ -2437,11 +2518,27 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
+ err = -EIO;
conf->last_used = -1;
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
+ if (i < conf->raid_disks &&
+ disk[conf->raid_disks].rdev) {
+ /* This slot has a replacement. */
+ if (!disk->rdev) {
+ /* No original, just make the replacement
+ * a recovering spare
+ */
+ disk->rdev =
+ disk[conf->raid_disks].rdev;
+ disk[conf->raid_disks].rdev = NULL;
+ } else if (!test_bit(In_sync, &disk->rdev->flags))
+ /* Original is not in_sync - bad */
+ goto abort;
+ }
+
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
@@ -2455,7 +2552,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->last_used = i;
}
- err = -EIO;
if (conf->last_used < 0) {
printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
mdname(mddev));
@@ -2665,7 +2761,7 @@ static int raid1_reshape(struct mddev *mddev)
if (!newpoolinfo)
return -ENOMEM;
newpoolinfo->mddev = mddev;
- newpoolinfo->raid_disks = raid_disks;
+ newpoolinfo->raid_disks = raid_disks * 2;
newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free, newpoolinfo);
@@ -2673,7 +2769,8 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
- newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+ newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+ GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
mempool_destroy(newpool);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c732b6cce935..80ded139314c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -12,6 +12,9 @@ struct mirror_info {
* pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct.
+ * The 'raid_disks' here is twice the raid_disks in r1conf.
+ * This allows space for each 'real' device can have a replacement in the
+ * second half of the array.
*/
struct pool_info {
@@ -21,7 +24,9 @@ struct pool_info {
struct r1conf {
struct mddev *mddev;
- struct mirror_info *mirrors;
+ struct mirror_info *mirrors; /* twice 'raid_disks' to
+ * allow for replacements.
+ */
int raid_disks;
/* When choose the best device for a read (read_balance())
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 685ddf325ee4..6e8aa213f0d5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -73,7 +73,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
struct r10conf *conf = data;
int size = offsetof(struct r10bio, devs[conf->copies]);
- /* allocate a r10bio with room for raid_disks entries in the bios array */
+ /* allocate a r10bio with room for raid_disks entries in the
+ * bios array */
return kzalloc(size, gfp_flags);
}
@@ -123,12 +124,19 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!bio)
goto out_free_bio;
r10_bio->devs[j].bio = bio;
+ if (!conf->have_replacement)
+ continue;
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r10_bio->devs[j].repl_bio = bio;
}
/*
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
for (j = 0 ; j < nalloc; j++) {
+ struct bio *rbio = r10_bio->devs[j].repl_bio;
bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) {
if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
@@ -143,6 +151,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
goto out_free_pages;
bio->bi_io_vec[i].bv_page = page;
+ if (rbio)
+ rbio->bi_io_vec[i].bv_page = page;
}
}
@@ -156,8 +166,11 @@ out_free_pages:
safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
- while ( ++j < nalloc )
+ while (++j < nalloc) {
bio_put(r10_bio->devs[j].bio);
+ if (r10_bio->devs[j].repl_bio)
+ bio_put(r10_bio->devs[j].repl_bio);
+ }
r10bio_pool_free(r10_bio, conf);
return NULL;
}
@@ -178,6 +191,9 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
}
bio_put(bio);
}
+ bio = r10bio->devs[j].repl_bio;
+ if (bio)
+ bio_put(bio);
}
r10bio_pool_free(r10bio, conf);
}
@@ -191,6 +207,10 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
*bio = NULL;
+ bio = &r10_bio->devs[i].repl_bio;
+ if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+ bio_put(*bio);
+ *bio = NULL;
}
}
@@ -275,19 +295,27 @@ static inline void update_head_pos(int slot, struct r10bio *r10_bio)
* Find the disk number which triggered given bio
*/
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
- struct bio *bio, int *slotp)
+ struct bio *bio, int *slotp, int *replp)
{
int slot;
+ int repl = 0;
- for (slot = 0; slot < conf->copies; slot++)
+ for (slot = 0; slot < conf->copies; slot++) {
if (r10_bio->devs[slot].bio == bio)
break;
+ if (r10_bio->devs[slot].repl_bio == bio) {
+ repl = 1;
+ break;
+ }
+ }
BUG_ON(slot == conf->copies);
update_head_pos(slot, r10_bio);
if (slotp)
*slotp = slot;
+ if (replp)
+ *replp = repl;
return r10_bio->devs[slot].devnum;
}
@@ -296,11 +324,13 @@ static void raid10_end_read_request(struct bio *bio, int error)
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
+ struct md_rdev *rdev;
struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
dev = r10_bio->devs[slot].devnum;
+ rdev = r10_bio->devs[slot].rdev;
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
@@ -318,7 +348,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+ rdev_dec_pending(rdev, conf->mddev);
} else {
/*
* oops, read error - keep the refcount on the rdev
@@ -327,7 +357,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
printk_ratelimited(KERN_ERR
"md/raid10:%s: %s: rescheduling sector %llu\n",
mdname(conf->mddev),
- bdevname(conf->mirrors[dev].rdev->bdev, b),
+ bdevname(rdev->bdev, b),
(unsigned long long)r10_bio->sector);
set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio);
@@ -366,17 +396,35 @@ static void raid10_end_write_request(struct bio *bio, int error)
int dev;
int dec_rdev = 1;
struct r10conf *conf = r10_bio->mddev->private;
- int slot;
+ int slot, repl;
+ struct md_rdev *rdev = NULL;
- dev = find_bio_disk(conf, r10_bio, bio, &slot);
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[dev].replacement;
+ if (!rdev) {
+ smp_rmb();
+ repl = 0;
+ rdev = conf->mirrors[dev].rdev;
+ }
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate) {
- set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
- set_bit(R10BIO_WriteError, &r10_bio->state);
- dec_rdev = 0;
+ if (repl)
+ /* Never record new bad blocks to replacement,
+ * just fail it.
+ */
+ md_error(rdev->mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ dec_rdev = 0;
+ }
} else {
/*
* Set R10BIO_Uptodate in our master bio, so that
@@ -393,12 +441,15 @@ static void raid10_end_write_request(struct bio *bio, int error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(conf->mirrors[dev].rdev,
+ if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors)) {
bio_put(bio);
- r10_bio->devs[slot].bio = IO_MADE_GOOD;
+ if (repl)
+ r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+ else
+ r10_bio->devs[slot].bio = IO_MADE_GOOD;
dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
@@ -414,7 +465,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
-
/*
* RAID10 layout manager
* As well as the chunksize and raid_disks count, there are two
@@ -562,14 +612,16 @@ static int raid10_mergeable_bvec(struct request_queue *q,
* FIXME: possibly should rethink readbalancing and do it differently
* depending on near_copies / far_copies geometry.
*/
-static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
+static struct md_rdev *read_balance(struct r10conf *conf,
+ struct r10bio *r10_bio,
+ int *max_sectors)
{
const sector_t this_sector = r10_bio->sector;
int disk, slot;
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *best_rdev;
int do_balance;
int best_slot;
@@ -578,6 +630,7 @@ static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_s
retry:
sectors = r10_bio->sectors;
best_slot = -1;
+ best_rdev = NULL;
best_dist = MaxSector;
best_good_sectors = 0;
do_balance = 1;
@@ -599,10 +652,16 @@ retry:
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL)
continue;
- if (!test_bit(In_sync, &rdev->flags))
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
continue;
dev_sector = r10_bio->devs[slot].addr;
@@ -627,6 +686,7 @@ retry:
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
best_slot = slot;
+ best_rdev = rdev;
}
if (!do_balance)
/* Must read from here */
@@ -655,16 +715,15 @@ retry:
if (new_distance < best_dist) {
best_dist = new_distance;
best_slot = slot;
+ best_rdev = rdev;
}
}
- if (slot == conf->copies)
+ if (slot >= conf->copies) {
slot = best_slot;
+ rdev = best_rdev;
+ }
if (slot >= 0) {
- disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (!rdev)
- goto retry;
atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
/* Cannot risk returning a device that failed
@@ -675,11 +734,11 @@ retry:
}
r10_bio->read_slot = slot;
} else
- disk = -1;
+ rdev = NULL;
rcu_read_unlock();
*max_sectors = best_good_sectors;
- return disk;
+ return rdev;
}
static int raid10_congested(void *data, int bits)
@@ -846,7 +905,6 @@ static void unfreeze_array(struct r10conf *conf)
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r10conf *conf = mddev->private;
- struct mirror_info *mirror;
struct r10bio *r10_bio;
struct bio *read_bio;
int i;
@@ -945,27 +1003,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
/*
* read balancing logic:
*/
- int disk;
+ struct md_rdev *rdev;
int slot;
read_again:
- disk = read_balance(conf, r10_bio, &max_sectors);
- slot = r10_bio->read_slot;
- if (disk < 0) {
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (!rdev) {
raid_end_bio_io(r10_bio);
return;
}
- mirror = conf->mirrors + disk;
+ slot = r10_bio->read_slot;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[slot].bio = read_bio;
+ r10_bio->devs[slot].rdev = rdev;
read_bio->bi_sector = r10_bio->devs[slot].addr +
- mirror->rdev->data_offset;
- read_bio->bi_bdev = mirror->rdev->bdev;
+ rdev->data_offset;
+ read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request;
read_bio->bi_rw = READ | do_sync;
read_bio->bi_private = r10_bio;
@@ -1025,6 +1083,7 @@ read_again:
*/
plugged = mddev_check_plugged(mddev);
+ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
raid10_find_phys(conf, r10_bio);
retry_write:
blocked_rdev = NULL;
@@ -1034,12 +1093,25 @@ retry_write:
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[d].replacement);
+ if (rdev == rrdev)
+ rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
break;
}
+ if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
+ if (rrdev && test_bit(Faulty, &rrdev->flags))
+ rrdev = NULL;
+
r10_bio->devs[i].bio = NULL;
+ r10_bio->devs[i].repl_bio = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
@@ -1088,6 +1160,10 @@ retry_write:
}
r10_bio->devs[i].bio = bio;
atomic_inc(&rdev->nr_pending);
+ if (rrdev) {
+ r10_bio->devs[i].repl_bio = bio;
+ atomic_inc(&rrdev->nr_pending);
+ }
}
rcu_read_unlock();
@@ -1096,11 +1172,23 @@ retry_write:
int j;
int d;
- for (j = 0; j < i; j++)
+ for (j = 0; j < i; j++) {
if (r10_bio->devs[j].bio) {
d = r10_bio->devs[j].devnum;
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
}
+ if (r10_bio->devs[j].repl_bio) {
+ struct md_rdev *rdev;
+ d = r10_bio->devs[j].devnum;
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ /* Race with remove_disk */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ rdev_dec_pending(rdev, mddev);
+ }
+ }
allow_barrier(conf);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf);
@@ -1147,6 +1235,31 @@ retry_write:
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ if (!r10_bio->devs[i].repl_bio)
+ continue;
+
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+ max_sectors);
+ r10_bio->devs[i].repl_bio = mbio;
+
+ /* We are actively writing to the original device
+ * so it cannot disappear, so the replacement cannot
+ * become NULL here
+ */
+ mbio->bi_sector = (r10_bio->devs[i].addr+
+ conf->mirrors[d].replacement->data_offset);
+ mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw = WRITE | do_sync | do_fua;
+ mbio->bi_private = r10_bio;
+
+ atomic_inc(&r10_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
/* Don't remove the bias on 'remaining' (one_write_done) until
@@ -1309,9 +1422,27 @@ static int raid10_spare_active(struct mddev *mddev)
*/
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i;
- if (tmp->rdev
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
sysfs_notify_dirent(tmp->rdev->sysfs_state);
}
@@ -1353,8 +1484,25 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p = &conf->mirrors[mirror];
if (p->recovery_disabled == mddev->recovery_disabled)
continue;
- if (p->rdev)
- continue;
+ if (p->rdev) {
+ if (!test_bit(WantReplacement, &p->rdev->flags) ||
+ p->replacement != NULL)
+ continue;
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ break;
+ }
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1385,40 +1533,61 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
return err;
}
-static int raid10_remove_disk(struct mddev *mddev, int number)
+static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r10conf *conf = mddev->private;
int err = 0;
- struct md_rdev *rdev;
- struct mirror_info *p = conf->mirrors+ number;
+ int number = rdev->raid_disk;
+ struct md_rdev **rdevp;
+ struct mirror_info *p = conf->mirrors + number;
print_conf(conf);
- rdev = p->rdev;
- if (rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
- err = -EBUSY;
- goto abort;
- }
- /* Only remove faulty devices in recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != p->recovery_disabled &&
- enough(conf, -1)) {
- err = -EBUSY;
- goto abort;
- }
- p->rdev = NULL;
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- p->rdev = rdev;
- goto abort;
- }
- err = md_integrity_register(mddev);
+ if (rdev == p->rdev)
+ rdevp = &p->rdev;
+ else if (rdev == p->replacement)
+ rdevp = &p->replacement;
+ else
+ return 0;
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
}
+ /* Only remove faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != p->recovery_disabled &&
+ (!p->replacement || p->replacement == rdev) &&
+ enough(conf, -1)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ *rdevp = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ goto abort;
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither -- if they are careful.
+ */
+ p->replacement = NULL;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ /* We might have just remove the Replacement as faulty
+ * Clear the flag just in case
+ */
+ clear_bit(WantReplacement, &rdev->flags);
+
+ err = md_integrity_register(mddev);
+
abort:
print_conf(conf);
@@ -1432,7 +1601,7 @@ static void end_sync_read(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private;
int d;
- d = find_bio_disk(conf, r10_bio, bio, NULL);
+ d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1493,19 +1662,34 @@ static void end_sync_write(struct bio *bio, int error)
sector_t first_bad;
int bad_sectors;
int slot;
-
- d = find_bio_disk(conf, r10_bio, bio, &slot);
+ int repl;
+ struct md_rdev *rdev = NULL;
+
+ d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
if (!uptodate) {
- set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
- set_bit(R10BIO_WriteError, &r10_bio->state);
- } else if (is_badblock(conf->mirrors[d].rdev,
+ if (repl)
+ md_error(mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ }
+ } else if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors))
set_bit(R10BIO_MadeGood, &r10_bio->state);
- rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ rdev_dec_pending(rdev, mddev);
end_sync_request(r10_bio);
}
@@ -1609,6 +1793,29 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
generic_make_request(tbio);
}
+ /* Now write out to any replacement devices
+ * that are active
+ */
+ for (i = 0; i < conf->copies; i++) {
+ int j, d;
+ int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
+
+ tbio = r10_bio->devs[i].repl_bio;
+ if (!tbio || !tbio->bi_end_io)
+ continue;
+ if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+ && r10_bio->devs[i].bio != fbio)
+ for (j = 0; j < vcnt; j++)
+ memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+ page_address(fbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ d = r10_bio->devs[i].devnum;
+ atomic_inc(&r10_bio->remaining);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ tbio->bi_size >> 9);
+ generic_make_request(tbio);
+ }
+
done:
if (atomic_dec_and_test(&r10_bio->remaining)) {
md_done_sync(mddev, r10_bio->sectors, 1);
@@ -1668,8 +1875,13 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s << 9,
bio->bi_io_vec[idx].bv_page,
WRITE, false);
- if (!ok)
+ if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
}
if (!ok) {
/* We don't worry if we cannot set a bad block -
@@ -1709,7 +1921,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int d;
- struct bio *wbio;
+ struct bio *wbio, *wbio2;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
fix_recovery_read_error(r10_bio);
@@ -1721,12 +1933,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* share the pages with the first bio
* and submit the write request
*/
- wbio = r10_bio->devs[1].bio;
d = r10_bio->devs[1].devnum;
-
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
- generic_make_request(wbio);
+ wbio = r10_bio->devs[1].bio;
+ wbio2 = r10_bio->devs[1].repl_bio;
+ if (wbio->bi_end_io) {
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+ generic_make_request(wbio);
+ }
+ if (wbio2 && wbio2->bi_end_io) {
+ atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ wbio2->bi_size >> 9);
+ generic_make_request(wbio2);
+ }
}
@@ -1779,8 +1999,12 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
- if (rw == WRITE)
+ if (rw == WRITE) {
set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
@@ -2060,10 +2284,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{
int slot = r10_bio->read_slot;
- int mirror = r10_bio->devs[slot].devnum;
struct bio *bio;
struct r10conf *conf = mddev->private;
- struct md_rdev *rdev;
+ struct md_rdev *rdev = r10_bio->devs[slot].rdev;
char b[BDEVNAME_SIZE];
unsigned long do_sync;
int max_sectors;
@@ -2081,15 +2304,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
- rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
+ rdev_dec_pending(rdev, mddev);
bio = r10_bio->devs[slot].bio;
bdevname(bio->bi_bdev, b);
r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
read_more:
- mirror = read_balance(conf, r10_bio, &max_sectors);
- if (mirror == -1) {
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (rdev == NULL) {
printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
" read error for block %llu\n",
mdname(mddev), b,
@@ -2103,7 +2326,6 @@ read_more:
if (bio)
bio_put(bio);
slot = r10_bio->read_slot;
- rdev = conf->mirrors[mirror].rdev;
printk_ratelimited(
KERN_ERR
"md/raid10:%s: %s: redirecting"
@@ -2117,6 +2339,7 @@ read_more:
r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[slot].bio = bio;
+ r10_bio->devs[slot].rdev = rdev;
bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
@@ -2187,6 +2410,22 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0))
md_error(conf->mddev, rdev);
}
+ rdev = conf->mirrors[dev].replacement;
+ if (r10_bio->devs[m].repl_bio == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE,
+ &r10_bio->devs[m].repl_bio->bi_flags)) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ } else {
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0))
+ md_error(conf->mddev, rdev);
+ }
}
put_buf(r10_bio);
} else {
@@ -2209,6 +2448,15 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
rdev_dec_pending(rdev, conf->mddev);
}
+ bio = r10_bio->devs[m].repl_bio;
+ rdev = conf->mirrors[dev].replacement;
+ if (rdev && bio == IO_MADE_GOOD) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
}
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
@@ -2272,9 +2520,14 @@ static void raid10d(struct mddev *mddev)
static int init_resync(struct r10conf *conf)
{
int buffs;
+ int i;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
BUG_ON(conf->r10buf_pool);
+ conf->have_replacement = 0;
+ for (i = 0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->have_replacement = 1;
conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
if (!conf->r10buf_pool)
return -ENOMEM;
@@ -2355,9 +2608,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
}
- } else /* completed sync */
+ } else {
+ /* completed sync */
+ if ((!mddev->bitmap || conf->fullsync)
+ && conf->have_replacement
+ && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* Completed a full sync so the replacements
+ * are now fully recovered.
+ */
+ for (i = 0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->mirrors[i].replacement
+ ->recovery_offset
+ = MaxSector;
+ }
conf->fullsync = 0;
-
+ }
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
*skipped = 1;
@@ -2414,23 +2680,30 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect;
int must_sync;
int any_working;
-
- if (conf->mirrors[i].rdev == NULL ||
- test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+ struct mirror_info *mirror = &conf->mirrors[i];
+
+ if ((mirror->rdev == NULL ||
+ test_bit(In_sync, &mirror->rdev->flags))
+ &&
+ (mirror->replacement == NULL ||
+ test_bit(Faulty,
+ &mirror->replacement->flags)))
continue;
still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
- /* Unless we are doing a full sync, we only need
- * to recover the block if it is set in the bitmap
+ /* Unless we are doing a full sync, or a replacement
+ * we only need to recover the block if it is set in
+ * the bitmap
*/
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
+ mirror->replacement == NULL &&
!conf->fullsync) {
/* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here
@@ -2500,33 +2773,60 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_end_io = end_sync_read;
bio->bi_rw = READ;
from_addr = r10_bio->devs[j].addr;
- bio->bi_sector = from_addr +
- conf->mirrors[d].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[d].rdev->bdev;
- atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- atomic_inc(&r10_bio->remaining);
- /* and we write to 'i' */
+ bio->bi_sector = from_addr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&rdev->nr_pending);
+ /* and we write to 'i' (if not in_sync) */
for (k=0; k<conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
BUG_ON(k == conf->copies);
- bio = r10_bio->devs[1].bio;
- bio->bi_next = biolist;
- biolist = bio;
- bio->bi_private = r10_bio;
- bio->bi_end_io = end_sync_write;
- bio->bi_rw = WRITE;
to_addr = r10_bio->devs[k].addr;
- bio->bi_sector = to_addr +
- conf->mirrors[i].rdev->data_offset;
- bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
r10_bio->devs[0].devnum = d;
r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
+ rdev = mirror->rdev;
+ if (!test_bit(In_sync, &rdev->flags)) {
+ bio = r10_bio->devs[1].bio;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = to_addr
+ + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
+ } else
+ r10_bio->devs[1].bio->bi_end_io = NULL;
+
+ /* and maybe write to replacement */
+ bio = r10_bio->devs[1].repl_bio;
+ if (bio)
+ bio->bi_end_io = NULL;
+ rdev = mirror->replacement;
+ /* Note: if rdev != NULL, then bio
+ * cannot be NULL as r10buf_pool_alloc will
+ * have allocated it.
+ * So the second test here is pointless.
+ * But it keeps semantic-checkers happy, and
+ * this comment keeps human reviewers
+ * happy.
+ */
+ if (rdev == NULL || bio == NULL ||
+ test_bit(Faulty, &rdev->flags))
+ break;
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = to_addr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
break;
}
if (j == conf->copies) {
@@ -2544,8 +2844,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
- if (!rdev_set_badblocks(
- conf->mirrors[i].rdev,
+ if (!test_bit(In_sync,
+ &mirror->rdev->flags)
+ && !rdev_set_badblocks(
+ mirror->rdev,
+ r10_bio->devs[k].addr,
+ max_sync, 0))
+ any_working = 0;
+ if (mirror->replacement &&
+ !rdev_set_badblocks(
+ mirror->replacement,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
@@ -2556,7 +2864,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
printk(KERN_INFO "md/raid10:%s: insufficient "
"working devices for recovery.\n",
mdname(mddev));
- conf->mirrors[i].recovery_disabled
+ mirror->recovery_disabled
= mddev->recovery_disabled;
}
break;
@@ -2605,6 +2913,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t first_bad, sector;
int bad_sectors;
+ if (r10_bio->devs[i].repl_bio)
+ r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL;
clear_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -2635,6 +2946,27 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
conf->mirrors[d].rdev->data_offset;
bio->bi_bdev = conf->mirrors[d].rdev->bdev;
count++;
+
+ if (conf->mirrors[d].replacement == NULL ||
+ test_bit(Faulty,
+ &conf->mirrors[d].replacement->flags))
+ continue;
+
+ /* Need to set up for writing to the replacement */
+ bio = r10_bio->devs[i].repl_bio;
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ sector = r10_bio->devs[i].addr;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_sector = sector +
+ conf->mirrors[d].replacement->data_offset;
+ bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+ count++;
}
if (count < 2) {
@@ -2643,6 +2975,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev,
mddev);
+ if (r10_bio->devs[i].repl_bio &&
+ r10_bio->devs[i].repl_bio->bi_end_io)
+ rdev_dec_pending(
+ conf->mirrors[d].replacement,
+ mddev);
}
put_buf(r10_bio);
biolist = NULL;
@@ -2896,6 +3233,16 @@ static int run(struct mddev *mddev)
continue;
disk = conf->mirrors + disk_idx;
+ if (test_bit(Replacement, &rdev->flags)) {
+ if (disk->replacement)
+ goto out_free_conf;
+ disk->replacement = rdev;
+ } else {
+ if (disk->rdev)
+ goto out_free_conf;
+ disk->rdev = rdev;
+ }
+
disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -2923,6 +3270,13 @@ static int run(struct mddev *mddev)
disk = conf->mirrors + i;
+ if (!disk->rdev && disk->replacement) {
+ /* The replacement is all we have - use it */
+ disk->rdev = disk->replacement;
+ disk->replacement = NULL;
+ clear_bit(Replacement, &disk->rdev->flags);
+ }
+
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7facfdf841f4..7c615613c381 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,7 +2,7 @@
#define _RAID10_H
struct mirror_info {
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *replacement;
sector_t head_position;
int recovery_disabled; /* matches
* mddev->recovery_disabled
@@ -18,12 +18,13 @@ struct r10conf {
spinlock_t device_lock;
/* geometry */
- int near_copies; /* number of copies laid out raid0 style */
+ int near_copies; /* number of copies laid out
+ * raid0 style */
int far_copies; /* number of copies laid out
* at large strides across drives
*/
- int far_offset; /* far_copies are offset by 1 stripe
- * instead of many
+ int far_offset; /* far_copies are offset by 1
+ * stripe instead of many
*/
int copies; /* near_copies * far_copies.
* must be <= raid_disks
@@ -34,10 +35,11 @@ struct r10conf {
* 1 stripe.
*/
- sector_t dev_sectors; /* temp copy of mddev->dev_sectors */
+ sector_t dev_sectors; /* temp copy of
+ * mddev->dev_sectors */
- int chunk_shift; /* shift from chunks to sectors */
- sector_t chunk_mask;
+ int chunk_shift; /* shift from chunks to sectors */
+ sector_t chunk_mask;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
@@ -45,20 +47,22 @@ struct r10conf {
int pending_count;
spinlock_t resync_lock;
- int nr_pending;
- int nr_waiting;
- int nr_queued;
- int barrier;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
-
+ int have_replacement; /* There is at least one
+ * replacement device.
+ */
wait_queue_head_t wait_barrier;
- mempool_t *r10bio_pool;
- mempool_t *r10buf_pool;
+ mempool_t *r10bio_pool;
+ mempool_t *r10buf_pool;
struct page *tmppage;
/* When taking over an array from a different personality, we store
@@ -98,11 +102,18 @@ struct r10bio {
* When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated.
+ * We sometimes need an extra bio to write to the replacement.
*/
struct {
- struct bio *bio;
- sector_t addr;
- int devnum;
+ struct bio *bio;
+ union {
+ struct bio *repl_bio; /* used for resync and
+ * writes */
+ struct md_rdev *rdev; /* used for reads
+ * (read_slot >= 0) */
+ };
+ sector_t addr;
+ int devnum;
} devs[0];
};
@@ -121,17 +132,19 @@ struct r10bio {
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
/* bits for r10bio.state */
-#define R10BIO_Uptodate 0
-#define R10BIO_IsSync 1
-#define R10BIO_IsRecover 2
-#define R10BIO_Degraded 3
+enum r10bio_state {
+ R10BIO_Uptodate,
+ R10BIO_IsSync,
+ R10BIO_IsRecover,
+ R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
-#define R10BIO_ReadError 4
+ R10BIO_ReadError,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
-#define R10BIO_MadeGood 5
-#define R10BIO_WriteError 6
+ R10BIO_MadeGood,
+ R10BIO_WriteError,
+};
#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 297e26092178..360f2b98f62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
*/
-static int has_failed(struct r5conf *conf)
+static int calc_degraded(struct r5conf *conf)
{
- int degraded;
+ int degraded, degraded2;
int i;
- if (conf->mddev->reshape_position == MaxSector)
- return conf->mddev->degraded > conf->max_degraded;
rcu_read_lock();
degraded = 0;
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf)
degraded++;
}
rcu_read_unlock();
- if (degraded > conf->max_degraded)
- return 1;
+ if (conf->raid_disks == conf->previous_raid_disks)
+ return degraded;
rcu_read_lock();
- degraded = 0;
+ degraded2 = 0;
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev || test_bit(Faulty, &rdev->flags))
- degraded++;
+ degraded2++;
else if (test_bit(In_sync, &rdev->flags))
;
else
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf)
* almost certainly hasn't.
*/
if (conf->raid_disks <= conf->previous_raid_disks)
- degraded++;
+ degraded2++;
}
rcu_read_unlock();
+ if (degraded2 > degraded)
+ return degraded2;
+ return degraded;
+}
+
+static int has_failed(struct r5conf *conf)
+{
+ int degraded;
+
+ if (conf->mddev->reshape_position == MaxSector)
+ return conf->mddev->degraded > conf->max_degraded;
+
+ degraded = calc_degraded(conf);
if (degraded > conf->max_degraded)
return 1;
return 0;
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
for (i = disks; i--; ) {
int rw;
- struct bio *bi;
- struct md_rdev *rdev;
+ int replace_only = 0;
+ struct bio *bi, *rbi;
+ struct md_rdev *rdev, *rrdev = NULL;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
rw = WRITE_FUA;
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rw = WRITE;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
- else
+ else if (test_and_clear_bit(R5_WantReplace,
+ &sh->dev[i].flags)) {
+ rw = WRITE;
+ replace_only = 1;
+ } else
continue;
bi = &sh->dev[i].req;
+ rbi = &sh->dev[i].rreq; /* For writing to replacement */
bi->bi_rw = rw;
- if (rw & WRITE)
+ rbi->bi_rw = rw;
+ if (rw & WRITE) {
bi->bi_end_io = raid5_end_write_request;
- else
+ rbi->bi_end_io = raid5_end_write_request;
+ } else
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
+ rrdev = rcu_dereference(conf->disks[i].replacement);
+ smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
rdev = rcu_dereference(conf->disks[i].rdev);
+ if (!rdev) {
+ rdev = rrdev;
+ rrdev = NULL;
+ }
+ if (rw & WRITE) {
+ if (replace_only)
+ rdev = NULL;
+ if (rdev == rrdev)
+ /* We raced and saw duplicates */
+ rrdev = NULL;
+ } else {
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+ rdev = rrdev;
+ rrdev = NULL;
+ }
+
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
+ if (rrdev && test_bit(Faulty, &rrdev->flags))
+ rrdev = NULL;
+ if (rrdev)
+ atomic_inc(&rrdev->nr_pending);
rcu_read_unlock();
/* We have already checked bad blocks for reads. Now
- * need to check for writes.
+ * need to check for writes. We never accept write errors
+ * on the replacement, so we don't to check rrdev.
*/
while ((rw & WRITE) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded)
+ if (s->syncing || s->expanding || s->expanded
+ || s->replacing)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_vcnt = 1;
- bi->bi_max_vecs = 1;
bi->bi_idx = 0;
- bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL;
+ if (rrdev)
+ set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
generic_make_request(bi);
- } else {
+ }
+ if (rrdev) {
+ if (s->syncing || s->expanding || s->expanded
+ || s->replacing)
+ md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+
+ set_bit(STRIPE_IO_STARTED, &sh->state);
+
+ rbi->bi_bdev = rrdev->bdev;
+ pr_debug("%s: for %llu schedule op %ld on "
+ "replacement disc %d\n",
+ __func__, (unsigned long long)sh->sector,
+ rbi->bi_rw, i);
+ atomic_inc(&sh->count);
+ rbi->bi_sector = sh->sector + rrdev->data_offset;
+ rbi->bi_flags = 1 << BIO_UPTODATE;
+ rbi->bi_idx = 0;
+ rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ rbi->bi_io_vec[0].bv_offset = 0;
+ rbi->bi_size = STRIPE_SIZE;
+ rbi->bi_next = NULL;
+ generic_make_request(rbi);
+ }
+ if (!rdev && !rrdev) {
if (rw & WRITE)
set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL;
for (i=0 ; i<disks; i++)
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error)
BUG();
return;
}
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ /* If replacement finished while this request was outstanding,
+ * 'replacement' might be NULL already.
+ * In that case it moved down to 'rdev'.
+ * rdev is not removed until all requests are finished.
+ */
+ rdev = conf->disks[i].replacement;
+ if (!rdev)
+ rdev = conf->disks[i].rdev;
if (uptodate) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- rdev = conf->disks[i].rdev;
+ /* Note that this cannot happen on a
+ * replacement device. We just fail those on
+ * any error
+ */
printk_ratelimited(
KERN_INFO
"md/raid:%s: read error corrected"
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
- if (atomic_read(&conf->disks[i].rdev->read_errors))
- atomic_set(&conf->disks[i].rdev->read_errors, 0);
+ if (atomic_read(&rdev->read_errors))
+ atomic_set(&rdev->read_errors, 0);
} else {
- const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
+ const char *bdn = bdevname(rdev->bdev, b);
int retry = 0;
- rdev = conf->disks[i].rdev;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
atomic_inc(&rdev->read_errors);
- if (conf->mddev->degraded >= conf->max_degraded)
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error on replacement device "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)(sh->sector
+ + rdev->data_offset),
+ bdn);
+ else if (conf->mddev->degraded >= conf->max_degraded)
printk_ratelimited(
KERN_WARNING
"md/raid:%s: read error not correctable "
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
md_error(conf->mddev, rdev);
}
}
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+ rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
+ struct md_rdev *uninitialized_var(rdev);
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad;
int bad_sectors;
+ int replacement = 0;
- for (i=0 ; i<disks; i++)
- if (bi == &sh->dev[i].req)
+ for (i = 0 ; i < disks; i++) {
+ if (bi == &sh->dev[i].req) {
+ rdev = conf->disks[i].rdev;
break;
-
+ }
+ if (bi == &sh->dev[i].rreq) {
+ rdev = conf->disks[i].replacement;
+ if (rdev)
+ replacement = 1;
+ else
+ /* rdev was removed and 'replacement'
+ * replaced it. rdev is not removed
+ * until all requests are finished.
+ */
+ rdev = conf->disks[i].rdev;
+ break;
+ }
+ }
pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
uptodate);
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error)
return;
}
- if (!uptodate) {
- set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
- set_bit(R5_WriteError, &sh->dev[i].flags);
- } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
- &first_bad, &bad_sectors))
- set_bit(R5_MadeGood, &sh->dev[i].flags);
+ if (replacement) {
+ if (!uptodate)
+ md_error(conf->mddev, rdev);
+ else if (is_badblock(rdev, sh->sector,
+ STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
+ } else {
+ if (!uptodate) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ set_bit(R5_WriteError, &sh->dev[i].flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ } else if (is_badblock(rdev, sh->sector,
+ STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_MadeGood, &sh->dev[i].flags);
+ }
+ rdev_dec_pending(rdev, conf->mddev);
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
-
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->req.bi_io_vec = &dev->vec;
dev->req.bi_vcnt++;
dev->req.bi_max_vecs++;
+ dev->req.bi_private = sh;
dev->vec.bv_page = dev->page;
- dev->vec.bv_len = STRIPE_SIZE;
- dev->vec.bv_offset = 0;
- dev->req.bi_sector = sh->sector;
- dev->req.bi_private = sh;
+ bio_init(&dev->rreq);
+ dev->rreq.bi_io_vec = &dev->rvec;
+ dev->rreq.bi_vcnt++;
+ dev->rreq.bi_max_vecs++;
+ dev->rreq.bi_private = sh;
+ dev->rvec.bv_page = dev->page;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r5conf *conf = mddev->private;
+ unsigned long flags;
pr_debug("raid456: error called\n");
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery was running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ clear_bit(In_sync, &rdev->flags);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
clear_bit(STRIPE_SYNCING, &sh->state);
s->syncing = 0;
+ s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
- * For recover we need to record a bad block on all
+ * For recover/replace we need to record a bad block on all
* non-sync devices, or abort the recovery
*/
if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
*/
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->disks[i].rdev;
- if (!rdev
- || test_bit(Faulty, &rdev->flags)
- || test_bit(In_sync, &rdev->flags))
- continue;
- if (!rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ rdev = conf->disks[i].replacement;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
abort = 1;
}
if (abort) {
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
}
}
+static int want_replace(struct stripe_head *sh, int disk_idx)
+{
+ struct md_rdev *rdev;
+ int rv = 0;
+ /* Doing recovery so rcu locking not required */
+ rdev = sh->raid_conf->disks[disk_idx].replacement;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && (rdev->recovery_offset <= sh->sector
+ || rdev->mddev->recovery_cp <= sh->sector))
+ rv = 1;
+
+ return rv;
+}
+
/* fetch_block - checks the given member device to see if its data needs
* to be read or computed to satisfy a request.
*
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)) ||
(s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
}
}
-
/*
* handle_stripe - do things to a stripe.
*
- * We lock the stripe and then examine the state of various bits
- * to see what needs to be done.
+ * We lock the stripe by setting STRIPE_ACTIVE and then examine the
+ * state of various bits to see what needs to be done.
* Possible results:
- * return some read request which now have data
- * return some write requests which are safely on disc
+ * return some read requests which now have data
+ * return some write requests which are safely on storage
* schedule a read on some buffers
* schedule a write of some buffers
* return confirmation of parity correctness
*
- * buffers are taken off read_list or write_list, and bh_cache buffers
- * get BH_Lock set before the stripe lock is released.
- *
*/
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
int disks = sh->disks;
struct r5dev *dev;
int i;
+ int do_recovery = 0;
memset(s, 0, sizeof(*s));
- s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
s->failed_num[0] = -1;
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
dev = &sh->dev[i];
pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
- i, dev->flags, dev->toread, dev->towrite, dev->written);
+ i, dev->flags,
+ dev->toread, dev->towrite, dev->written);
/* maybe we can reply to a read
*
* new wantfill requests are only permitted while
@@ -3035,7 +3169,23 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
if (dev->written)
s->written++;
- rdev = rcu_dereference(conf->disks[i].rdev);
+ /* Prefer to use the replacement for reads, but only
+ * if it is recovered enough and has no bad blocks.
+ */
+ rdev = rcu_dereference(conf->disks[i].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags) &&
+ rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
+ !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_ReadRepl, &dev->flags);
+ else {
+ if (rdev)
+ set_bit(R5_NeedReplace, &dev->flags);
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ clear_bit(R5_ReadRepl, &dev->flags);
+ }
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ rdev = NULL;
if (rdev) {
is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
&first_bad, &bad_sectors);
@@ -3063,26 +3213,50 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (!test_bit(Faulty, &rdev->flags)) {
+ else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
/* in sync if before recovery_offset */
- if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
- set_bit(R5_Insync, &dev->flags);
- }
- if (test_bit(R5_WriteError, &dev->flags)) {
- clear_bit(R5_Insync, &dev->flags);
- if (!test_bit(Faulty, &rdev->flags)) {
+ set_bit(R5_Insync, &dev->flags);
+ else if (test_bit(R5_UPTODATE, &dev->flags) &&
+ test_bit(R5_Expanded, &dev->flags))
+ /* If we've reshaped into here, we assume it is Insync.
+ * We will shortly update recovery_offset to make
+ * it official.
+ */
+ set_bit(R5_Insync, &dev->flags);
+
+ if (rdev && test_bit(R5_WriteError, &dev->flags)) {
+ /* This flag does not apply to '.replacement'
+ * only to .rdev, so make sure to check that*/
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].rdev);
+ if (rdev2 == rdev)
+ clear_bit(R5_Insync, &dev->flags);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1;
- atomic_inc(&rdev->nr_pending);
+ atomic_inc(&rdev2->nr_pending);
} else
clear_bit(R5_WriteError, &dev->flags);
}
- if (test_bit(R5_MadeGood, &dev->flags)) {
- if (!test_bit(Faulty, &rdev->flags)) {
+ if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
+ /* This flag does not apply to '.replacement'
+ * only to .rdev, so make sure to check that*/
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].rdev);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1;
- atomic_inc(&rdev->nr_pending);
+ atomic_inc(&rdev2->nr_pending);
} else
clear_bit(R5_MadeGood, &dev->flags);
}
+ if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].replacement);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev2->nr_pending);
+ } else
+ clear_bit(R5_MadeGoodRepl, &dev->flags);
+ }
if (!test_bit(R5_Insync, &dev->flags)) {
/* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags);
@@ -3094,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (s->failed < 2)
s->failed_num[s->failed] = i;
s->failed++;
+ if (rdev && !test_bit(Faulty, &rdev->flags))
+ do_recovery = 1;
}
}
spin_unlock_irq(&conf->device_lock);
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ /* If there is a failed device being replaced,
+ * we must be recovering.
+ * else if we are after recovery_cp, we must be syncing
+ * else we can only be replacing
+ * sync and recovery both need to read all devices, and so
+ * use the same flag.
+ */
+ if (do_recovery ||
+ sh->sector >= conf->mddev->recovery_cp)
+ s->syncing = 1;
+ else
+ s->replacing = 1;
+ }
rcu_read_unlock();
}
@@ -3138,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh)
if (unlikely(s.blocked_rdev)) {
if (s.syncing || s.expanding || s.expanded ||
- s.to_write || s.written) {
+ s.replacing || s.to_write || s.written) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -3164,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh)
sh->reconstruct_state = 0;
if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
- if (s.syncing)
+ if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -3195,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh)
*/
if (s.to_read || s.non_overwrite
|| (conf->level == 6 && s.to_write && s.failed)
- || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+ || (s.syncing && (s.uptodate + s.compute < disks))
+ || s.replacing
+ || s.expanding)
handle_stripe_fill(sh, &s, disks);
/* Now we check to see if any write operations have recently
@@ -3257,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks);
}
- if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+ if (s.replacing && s.locked == 0
+ && !test_bit(STRIPE_INSYNC, &sh->state)) {
+ /* Write out to replacement devices where possible */
+ for (i = 0; i < conf->raid_disks; i++)
+ if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
+ test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ set_bit(R5_WantReplace, &sh->dev[i].flags);
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ s.locked++;
+ }
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ if ((s.syncing || s.replacing) && s.locked == 0 &&
+ test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
}
@@ -3355,6 +3560,15 @@ finish:
STRIPE_SECTORS);
rdev_dec_pending(rdev, conf->mddev);
}
+ if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
+ rdev = conf->disks[i].replacement;
+ if (!rdev)
+ /* rdev have been moved down */
+ rdev = conf->disks[i].rdev;
+ rdev_clear_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
}
if (s.ops_request)
@@ -3578,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
int dd_idx;
struct bio* align_bi;
struct md_rdev *rdev;
+ sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) {
pr_debug("chunk_aligned_read : non aligned\n");
@@ -3602,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
0,
&dd_idx, NULL);
+ end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
rcu_read_lock();
- rdev = rcu_dereference(conf->disks[dd_idx].rdev);
- if (rdev && test_bit(In_sync, &rdev->flags)) {
+ rdev = rcu_dereference(conf->disks[dd_idx].replacement);
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ rdev->recovery_offset < end_sector) {
+ rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+ if (rdev &&
+ (test_bit(Faulty, &rdev->flags) ||
+ !(test_bit(In_sync, &rdev->flags) ||
+ rdev->recovery_offset >= end_sector)))
+ rdev = NULL;
+ }
+ if (rdev) {
sector_t first_bad;
int bad_sectors;
@@ -4129,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
-
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4200,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
return handled;
}
- set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
release_stripe(sh);
raid5_set_bi_hw_segments(raid_bio, scnt);
@@ -4627,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
continue;
disk = conf->disks + raid_disk;
- disk->rdev = rdev;
+ if (test_bit(Replacement, &rdev->flags)) {
+ if (disk->replacement)
+ goto abort;
+ disk->replacement = rdev;
+ } else {
+ if (disk->rdev)
+ goto abort;
+ disk->rdev = rdev;
+ }
if (test_bit(In_sync, &rdev->flags)) {
char b[BDEVNAME_SIZE];
@@ -4716,6 +4947,7 @@ static int run(struct mddev *mddev)
int dirty_parity_disks = 0;
struct md_rdev *rdev;
sector_t reshape_offset = 0;
+ int i;
if (mddev->recovery_cp != MaxSector)
printk(KERN_NOTICE "md/raid:%s: not clean"
@@ -4805,12 +5037,25 @@ static int run(struct mddev *mddev)
conf->thread = NULL;
mddev->private = conf;
- /*
- * 0 for a fully functional array, 1 or 2 for a degraded array.
- */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
- if (rdev->raid_disk < 0)
+ for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
+ i++) {
+ rdev = conf->disks[i].rdev;
+ if (!rdev && conf->disks[i].replacement) {
+ /* The replacement is all we have yet */
+ rdev = conf->disks[i].replacement;
+ conf->disks[i].replacement = NULL;
+ clear_bit(Replacement, &rdev->flags);
+ conf->disks[i].rdev = rdev;
+ }
+ if (!rdev)
continue;
+ if (conf->disks[i].replacement &&
+ conf->reshape_progress != MaxSector) {
+ /* replacements and reshape simply do not mix. */
+ printk(KERN_ERR "md: cannot handle concurrent "
+ "replacement and reshape.\n");
+ goto abort;
+ }
if (test_bit(In_sync, &rdev->flags)) {
working_disks++;
continue;
@@ -4844,8 +5089,10 @@ static int run(struct mddev *mddev)
dirty_parity_disks++;
}
- mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
- - working_disks);
+ /*
+ * 0 for a fully functional array, 1 or 2 for a degraded array.
+ */
+ mddev->degraded = calc_degraded(conf);
if (has_failed(conf)) {
printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5008,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev)
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
- if (tmp->rdev
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active. */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5017,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev)
}
}
spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded -= count;
+ mddev->degraded = calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
print_raid5_conf(conf);
return count;
}
-static int raid5_remove_disk(struct mddev *mddev, int number)
+static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
int err = 0;
- struct md_rdev *rdev;
+ int number = rdev->raid_disk;
+ struct md_rdev **rdevp;
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
- rdev = p->rdev;
- if (rdev) {
- if (number >= conf->raid_disks &&
- conf->reshape_progress == MaxSector)
- clear_bit(In_sync, &rdev->flags);
+ if (rdev == p->rdev)
+ rdevp = &p->rdev;
+ else if (rdev == p->replacement)
+ rdevp = &p->replacement;
+ else
+ return 0;
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
- err = -EBUSY;
- goto abort;
- }
- /* Only remove non-faulty devices if recovery
- * isn't possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
- !has_failed(conf) &&
- number < conf->raid_disks) {
- err = -EBUSY;
- goto abort;
- }
- p->rdev = NULL;
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- p->rdev = rdev;
- }
+ if (number >= conf->raid_disks &&
+ conf->reshape_progress == MaxSector)
+ clear_bit(In_sync, &rdev->flags);
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
}
+ /* Only remove non-faulty devices if recovery
+ * isn't possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
+ !has_failed(conf) &&
+ (!p->replacement || p->replacement == rdev) &&
+ number < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
+ *rdevp = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither - if they are careful
+ */
+ p->replacement = NULL;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ /* We might have just removed the Replacement as faulty-
+ * clear the bit just in case
+ */
+ clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
@@ -5095,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk = rdev->saved_raid_disk;
else
disk = first;
- for ( ; disk <= last ; disk++)
- if ((p=conf->disks + disk)->rdev == NULL) {
+ for ( ; disk <= last ; disk++) {
+ p = conf->disks + disk;
+ if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
err = 0;
@@ -5105,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
+ if (test_bit(WantReplacement, &p->rdev->flags) &&
+ p->replacement == NULL) {
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = disk;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ break;
+ }
+ }
print_raid5_conf(conf);
return err;
}
@@ -5278,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev)
* pre and post number of devices.
*/
spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
- - added_devices;
+ mddev->degraded = calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
@@ -5348,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev)
revalidate_disk(mddev->gendisk);
} else {
int d;
- mddev->degraded = conf->raid_disks;
- for (d = 0; d < conf->raid_disks ; d++)
- if (conf->disks[d].rdev &&
- test_bit(In_sync,
- &conf->disks[d].rdev->flags))
- mddev->degraded--;
+ spin_lock_irq(&conf->device_lock);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irq(&conf->device_lock);
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
struct md_rdev *rdev = conf->disks[d].rdev;
- if (rdev && raid5_remove_disk(mddev, d) == 0) {
+ if (rdev &&
+ raid5_remove_disk(mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e10c5531f9c5..8d8e13934a48 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -27,7 +27,7 @@
* The possible state transitions are:
*
* Empty -> Want - on read or write to get old data for parity calc
- * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.
* Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request
@@ -226,8 +226,11 @@ struct stripe_head {
#endif
} ops;
struct r5dev {
- struct bio req;
- struct bio_vec vec;
+ /* rreq and rvec are used for the replacement device when
+ * writing data to both devices.
+ */
+ struct bio req, rreq;
+ struct bio_vec vec, rvec;
struct page *page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
@@ -239,7 +242,13 @@ struct stripe_head {
* for handle_stripe.
*/
struct stripe_head_state {
- int syncing, expanding, expanded;
+ /* 'syncing' means that we need to read all devices, either
+ * to check/correct parity, or to reconstruct a missing device.
+ * 'replacing' means we are replacing one or more drives and
+ * the source is valid at this point so we don't need to
+ * read all devices, just the replacement targets.
+ */
+ int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
int failed_num[2];
@@ -252,38 +261,41 @@ struct stripe_head_state {
int handle_bad_blocks;
};
-/* Flags */
-#define R5_UPTODATE 0 /* page contains current data */
-#define R5_LOCKED 1 /* IO has been submitted on "req" */
-#define R5_OVERWRITE 2 /* towrite covers whole page */
+/* Flags for struct r5dev.flags */
+enum r5dev_flags {
+ R5_UPTODATE, /* page contains current data */
+ R5_LOCKED, /* IO has been submitted on "req" */
+ R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
+ R5_OVERWRITE, /* towrite covers whole page */
/* and some that are internal to handle_stripe */
-#define R5_Insync 3 /* rdev && rdev->in_sync at start */
-#define R5_Wantread 4 /* want to schedule a read */
-#define R5_Wantwrite 5
-#define R5_Overlap 7 /* There is a pending overlapping request on this block */
-#define R5_ReadError 8 /* seen a read error here recently */
-#define R5_ReWrite 9 /* have tried to over-write the readerror */
+ R5_Insync, /* rdev && rdev->in_sync at start */
+ R5_Wantread, /* want to schedule a read */
+ R5_Wantwrite,
+ R5_Overlap, /* There is a pending overlapping request
+ * on this block */
+ R5_ReadError, /* seen a read error here recently */
+ R5_ReWrite, /* have tried to over-write the readerror */
-#define R5_Expanded 10 /* This block now has post-expand data */
-#define R5_Wantcompute 11 /* compute_block in progress treat as
- * uptodate
- */
-#define R5_Wantfill 12 /* dev->toread contains a bio that needs
- * filling
- */
-#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
-#define R5_WantFUA 14 /* Write should be FUA */
-#define R5_WriteError 15 /* got a write error - need to record it */
-#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
-/*
- * Write method
- */
-#define RECONSTRUCT_WRITE 1
-#define READ_MODIFY_WRITE 2
-/* not a write method, but a compute_parity mode */
-#define CHECK_PARITY 3
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY 4
+ R5_Expanded, /* This block now has post-expand data */
+ R5_Wantcompute, /* compute_block in progress treat as
+ * uptodate
+ */
+ R5_Wantfill, /* dev->toread contains a bio that needs
+ * filling
+ */
+ R5_Wantdrain, /* dev->towrite needs to be drained */
+ R5_WantFUA, /* Write should be FUA */
+ R5_WriteError, /* got a write error - need to record it */
+ R5_MadeGood, /* A bad block has been fixed by writing to it */
+ R5_ReadRepl, /* Will/did read from replacement rather than orig */
+ R5_MadeGoodRepl,/* A bad block on the replacement device has been
+ * fixed by writing to it */
+ R5_NeedReplace, /* This device has a replacement which is not
+ * up-to-date at this stripe. */
+ R5_WantReplace, /* We need to update the replacement, we have read
+ * data in, and now is a good time to write it out.
+ */
+};
/*
* Stripe state
@@ -311,13 +323,14 @@ enum {
/*
* Operation request flags
*/
-#define STRIPE_OP_BIOFILL 0
-#define STRIPE_OP_COMPUTE_BLK 1
-#define STRIPE_OP_PREXOR 2
-#define STRIPE_OP_BIODRAIN 3
-#define STRIPE_OP_RECONSTRUCT 4
-#define STRIPE_OP_CHECK 5
-
+enum {
+ STRIPE_OP_BIOFILL,
+ STRIPE_OP_COMPUTE_BLK,
+ STRIPE_OP_PREXOR,
+ STRIPE_OP_BIODRAIN,
+ STRIPE_OP_RECONSTRUCT,
+ STRIPE_OP_CHECK,
+};
/*
* Plugging:
*
@@ -344,13 +357,12 @@ enum {
struct disk_info {
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *replacement;
};
struct r5conf {
struct hlist_head *stripe_hashtbl;
struct mddev *mddev;
- struct disk_info *spare;
int chunk_sectors;
int level, algorithm;
int max_degraded;