diff options
author | Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com> | 2022-03-22 18:23:38 +0300 |
---|---|---|
committer | Song Liu <song@kernel.org> | 2022-04-26 00:00:34 +0300 |
commit | 9631abdbf406c764f2a5d8305eac063bc3396a0a (patch) | |
tree | 562765b208e282d83742f8c90b84d6bc5fcf934e /drivers/md | |
parent | 5ea7c1339e3ed094dd4df48d598f9018a2587283 (diff) | |
download | linux-9631abdbf406c764f2a5d8305eac063bc3396a0a.tar.xz |
md: Set MD_BROKEN for RAID1 and RAID10
There is no direct mechanism to determine raid failure outside
personality. It is done by checking rdev->flags after executing
md_error(). If "faulty" flag is not set then -EBUSY is returned to
userspace. -EBUSY means that array will be failed after drive removal.
Mdadm has special routine to handle the array failure and it is executed
if -EBUSY is returned by md.
There are at least two known reasons to not consider this mechanism
as correct:
1. drive can be removed even if array will be failed[1].
2. -EBUSY seems to be wrong status. Array is not busy, but removal
process cannot proceed safe.
-EBUSY expectation cannot be removed without breaking compatibility
with userspace. In this patch first issue is resolved by adding support
for MD_BROKEN flag for RAID1 and RAID10. Support for RAID456 is added in
next commit.
The idea is to set the MD_BROKEN if we are sure that raid is in failed
state now. This is done in each error_handler(). In md_error() MD_BROKEN
flag is checked. If is set, then -EBUSY is returned to userspace.
As in previous commit, it causes that #mdadm --set-faulty is able to
fail array. Previously proposed workaround is valid if optional
functionality[1] is disabled.
[1] commit 9a567843f7ce("md: allow last device to be forcibly removed from
RAID1/RAID10.")
Reviewd-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Signed-off-by: Song Liu <song@kernel.org>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 27 | ||||
-rw-r--r-- | drivers/md/md.h | 62 | ||||
-rw-r--r-- | drivers/md/raid1.c | 43 | ||||
-rw-r--r-- | drivers/md/raid10.c | 40 |
4 files changed, 100 insertions, 72 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 2587f872c088..9294f13e0c9d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2984,10 +2984,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - if (test_bit(Faulty, &rdev->flags)) - err = 0; - else + + if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; + else + err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); @@ -4353,10 +4354,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, * like active, but no writes have been seen for a while (100msec). * * broken - * RAID0/LINEAR-only: same as clean, but array is missing a member. - * It's useful because RAID0/LINEAR mounted-arrays aren't stopped - * when a member is gone, so this state will at least alert the - * user that something is wrong. +* Array is failed. It's useful because mounted-arrays aren't stopped +* when array is failed, so this state will at least alert the user that +* something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; @@ -7443,7 +7443,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) err = -ENODEV; else { md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) + if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); @@ -7984,13 +7984,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) if (!mddev->pers || !mddev->pers->error_handler) return; - mddev->pers->error_handler(mddev,rdev); - if (mddev->degraded) + mddev->pers->error_handler(mddev, rdev); + + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + if (!test_bit(MD_BROKEN, &mddev->flags)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); md_new_event(); diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ac283864533..cf2cbb17acbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; -/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ +/** + * enum mddev_flags - md device flags. + * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. + * @MD_CLOSING: If set, we are closing the array, do not open it then. + * @MD_JOURNAL_CLEAN: A raid with journal is already clean. + * @MD_HAS_JOURNAL: The raid array has journal feature set. + * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took + * resync lock, need to release the lock. + * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as + * calls to md_error() will never cause the array to + * become failed. + * @MD_HAS_PPL: The raid array has PPL feature set. + * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. + * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata + * without taking reconfig_mutex. + * @MD_UPDATING_SB: md_check_recovery is updating the metadata without + * explicitly holding reconfig_mutex. + * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that + * array is ready yet. + * @MD_BROKEN: This is used to stop writes and mark array as failed. + * + * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added + */ enum mddev_flags { - MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ - MD_CLOSING, /* If set, we are closing the array, do not open - * it then */ - MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ - MD_HAS_JOURNAL, /* The raid array has journal feature set */ - MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node - * already took resync lock, need to - * release the lock */ - MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is - * supported as calls to md_error() will - * never cause the array to become failed. - */ - MD_HAS_PPL, /* The raid array has PPL feature set */ - MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ - MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update - * the metadata without taking reconfig_mutex. - */ - MD_UPDATING_SB, /* md_check_recovery is updating the metadata - * without explicitly holding reconfig_mutex. - */ - MD_NOT_READY, /* do_md_run() is active, so 'array_state' - * must not report that array is ready yet - */ - MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop - * I/O in case an array member is gone/failed. - */ + MD_ARRAY_FIRST_USE, + MD_CLOSING, + MD_JOURNAL_CLEAN, + MD_HAS_JOURNAL, + MD_CLUSTER_RESYNC_LOCKED, + MD_FAILFAST_SUPPORTED, + MD_HAS_PPL, + MD_HAS_MULTIPLE_PPLS, + MD_ALLOW_SB_UPDATE, + MD_UPDATING_SB, + MD_NOT_READY, + MD_BROKEN, }; enum mddev_sb_flags { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5aed2c8b746e..99d5af1362d7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1641,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "]"); } +/** + * raid1_error() - RAID1 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * - recovery is disabled. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + * + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && (conf->raid_disks - mddev->degraded) == 1) { - /* - * Don't fail the drive, act as though we were just a - * normal single drive. - * However don't try a recovery from this drive as - * it is very likely to fail. - */ - conf->recovery_disabled = mddev->recovery_disabled; - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && + (conf->raid_disks - mddev->degraded) == 1) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 834eb3ba95a6..dfa576cdf11c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore) _enough(conf, 1, ignore); } +/** + * raid10_error() - RAID10 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && !enough(conf, rdev->raid_disk)) { - /* - * Don't fail the drive, just return an IO error. - */ - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - /* - * If recovery is running, make sure it aborts. - */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); |