From 1af2048a3e87b4e982c53ad8cfb0c75d1a9c0a73 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:48 +0100 Subject: dm raid: fix deadlock caused by premature md_stop_writes() md_stop_writes() is called in raid_presuspend() causing deadlocks on bios submitted afterwards -- which happens on loaded raid sets with conversion requests. Fix by moving md_stop_writes() to raid_postsuspend(). NOTE: when the recovery's frozen (MD_RECOVERY_FROZEN), writes haven't been started (or are already stopped) so don't stop them again. Also remove superfluous readonly setting. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6319d846e0ad..398314b6c31a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3613,24 +3613,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); } -static void raid_presuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - md_stop_writes(&rs->md); -} - static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Writes have to be stopped before suspending to avoid deadlocks. 
*/ + if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) + md_stop_writes(&rs->md); + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); mddev_unlock(&rs->md); } - - rs->md.ro = 1; } static void attempt_restore_of_faulty_devices(struct raid_set *rs) @@ -3903,7 +3898,6 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, - .presuspend = raid_presuspend, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, -- cgit v1.2.3 From 052b2b1e0689b30af2608d908916a16e9dbd0919 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:49 +0100 Subject: dm raid: consume sizes after md_finish_reshape() completes changing them The md raid personalities call md_finish_reshape() at the end of a reshape conversion which adjusts rdev->sectors. Correct/check rdev->sectors before initiating a reshape and raise the recovery pointer accordingly. Otherwise, the DM raid coordinated reshape will fail. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 398314b6c31a..c3ea4337bf51 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2640,12 +2640,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs) * Make sure we got a minimum amount of free sectors per device */ if (rs->data_offset && - to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) { + to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { rs->ti->error = data_offset ? "No space for forward reshape" : "No space for backward reshape"; return -ENOSPC; } out: + /* + * Raise recovery_cp in case data_offset != 0 to + * avoid false recovery positives in the constructor. 
+ */ + if (rs->md.recovery_cp < rs->md.dev_sectors) + rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { if (!test_bit(Journal, &rdev->flags)) { @@ -2777,6 +2784,23 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* * * - change raid layout @@ -2788,6 +2812,7 @@ static int rs_setup_reshape(struct raid_set *rs) { int r = 0; unsigned int cur_raid_devs, d; + sector_t reshape_sectors = _get_reshape_sectors(rs); struct mddev *mddev = &rs->md; struct md_rdev *rdev; @@ -2804,13 +2829,13 @@ static int rs_setup_reshape(struct raid_set *rs) /* * Adjust array size: * - * - in case of adding disks, array size has + * - in case of adding disk(s), array size has * to grow after the disk adding reshape, * which'll hapen in the event handler; * reshape will happen forward, so space has to * be available at the beginning of each disk * - * - in case of removing disks, array size + * - in case of removing disk(s), array size * has to shrink before starting the reshape, * which'll happen here; * reshape will happen backward, so space has to @@ -2841,7 +2866,7 @@ static int rs_setup_reshape(struct raid_set *rs) rdev->recovery_offset = rs_is_raid1(rs) ? 
0 : MaxSector; } - mddev->reshape_backwards = 0; /* adding disks -> forward reshape */ + mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */ /* Remove disk(s) */ } else if (rs->delta_disks < 0) { @@ -2874,6 +2899,15 @@ static int rs_setup_reshape(struct raid_set *rs) mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; } + /* + * Adjust device size for forward reshape + * because md_finish_reshape() reduces it. + */ + if (!mddev->reshape_backwards) + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors += reshape_sectors; + return r; } -- cgit v1.2.3 From 7501537ee3a5e6bd01c0084af141e4fa84e652c0 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:50 +0100 Subject: dm raid: correct resizing state relative to reshape space in ctr Pay attention to existing reshape space to define if a raid set needs resizing. Otherwise we can hit "Can't resize a reshaping raid set" when a reshape is being requested. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c3ea4337bf51..c4b0cb181fbc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2969,10 +2969,10 @@ static void configure_discard_support(struct raid_set *rs) static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - bool resize; + bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors; + sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3055,7 +3055,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - resize = calculated_dev_sectors != rdev_sectors; + + reshape_sectors = _get_reshape_sectors(rs); 
+ if (calculated_dev_sectors != rdev_sectors) + resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3178,7 +3181,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ - if (r) { ti->error = "Failed to run raid array"; mddev_unlock(&rs->md); -- cgit v1.2.3 From 61e06e2c3ebd986050958513bfa40dceed756f8f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:51 +0100 Subject: dm raid: fix raid set size revalidation The raid set size is being revalidated unconditionally before a reshaping conversion is started. MD requires the size to only be reduced in case of a stripe removing (i.e. shrinking) reshape but not when growing because the raid array has to stay small until after the growing reshape finishes. Fix by avoiding the size revalidation in preresume unless a shrinking reshape is requested. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c4b0cb181fbc..ff75324133fb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -675,15 +675,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout) return NULL; } -/* - * Conditionally change bdev capacity of @rs - * in case of a disk add/remove reshape - */ -static void rs_set_capacity(struct raid_set *rs) +/* Adjust rdev sectors */ +static void rs_set_rdev_sectors(struct raid_set *rs) { struct mddev *mddev = &rs->md; struct md_rdev *rdev; - struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); /* * raid10 sets rdev->sector to the device size, which @@ -692,8 +688,16 @@ static void rs_set_capacity(struct raid_set *rs) rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags)) rdev->sectors = mddev->dev_sectors; +} - set_capacity(gendisk, mddev->array_sectors); +/* + * Change bdev capacity of @rs in case of a disk add/remove reshape + */ +static void rs_set_capacity(struct raid_set *rs) +{ + struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); + + set_capacity(gendisk, rs->md.array_sectors); revalidate_disk(gendisk); } @@ -1674,8 +1678,11 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) + if (!rs_is_reshaping(rs)) { + if (rs_is_raid10(rs)) + rs_set_rdev_sectors(rs); rs_set_capacity(rs); + } dm_table_event(rs->ti->table); } @@ -3873,11 +3880,10 @@ static int raid_preresume(struct dm_target *ti) mddev->resync_min = mddev->recovery_cp; } - rs_set_capacity(rs); - /* Check for any reshape request unless new raid set */ if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, 
&rs->runtime_flags)) { /* Initiate a reshape. */ + rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); r = rs_start_reshape(rs); mddev_unlock(mddev); @@ -3906,6 +3912,10 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; + /* Only reduce raid set size before running a disk removing reshape. */ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + /* * Keep the RAID set frozen if reshape/rebuild flags are set. * The RAID set is unfrozen once the next table load/resume, -- cgit v1.2.3 From 188a212df1f3a2d7ea9bb0fc0ab4173042c23470 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:59 +0100 Subject: dm raid: add component device size checks to avoid runtime failure Check all component data device sizes versus calculated size. Reject if device(s) are too small. Otherwise, MD will fail the operation by accessing beyond the end of the data device. An example use-case is that growing bitmap won't fit any more and the MD runtime will report an error when DM raid should catch this earlier. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index ff75324133fb..2bb0ac7c3fba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1580,6 +1580,24 @@ static sector_t __rdev_sectors(struct raid_set *rs) return 0; } +/* Check that calculated dev_sectors fits all component devices. 
*/ +static int _check_data_dev_sectors(struct raid_set *rs) +{ + sector_t ds = ~0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { + ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode))); + if (ds < rs->md.dev_sectors) { + rs->ti->error = "Component device(s) too small"; + return -EINVAL; + } + } + + return 0; +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1629,7 +1647,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; - return 0; + return _check_data_dev_sectors(rs); bad: rs->ti->error = "Target length not divisible by number of data devices"; return -EINVAL; -- cgit v1.2.3 From d39f0010e40964d959c5157be02839da8a178015 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:52 +0100 Subject: dm raid: fix raid_resume() to keep raid set frozen as needed During a reshape request: if userspace reloads a "raid" table multiple times, resulting in multiple superblock reads, the raid set needs to stay frozen until all config changes (chunk size, layout data_offset, delta_disks) have been stored in the superblocks and respective flags cleared. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2bb0ac7c3fba..bf3c9e3c736d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3899,7 +3899,7 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any reshape request unless new raid set */ - if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. 
*/ rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); @@ -3941,8 +3941,14 @@ static void raid_resume(struct dm_target *ti) * This ensures that the constructor for the inactive table * retrieves an up-to-date reshape_position. */ - if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && + !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { + if (rs_is_reshapable(rs)) { + if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); -- cgit v1.2.3 From 67143510a7e3634a23f06a48445d1148b2fdbc4d Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:53 +0100 Subject: dm raid: display a consistent copy of the MD status via raid_status() The MD sync thread updates recovery flags providing state of any running, idle, frozen, recovering, reshaping, ... activity it performs and updates respective flags asynchronously versus dm processing raid_status(). To close that race window, take a single copy of the flags and pass it into its callees. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index bf3c9e3c736d..3df7c5bd5a9b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3300,25 +3300,25 @@ static int raid_map(struct dm_target *ti, struct bio *bio) } /* Return string describing the current sync action of @mddev */ -static const char *decipher_sync_action(struct mddev *mddev) +static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery) { - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) return "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) return "check"; return "repair"; } - if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return "recover"; } @@ -3350,7 +3350,7 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, } /* Helper to return resync/reshape progress for @rs and @array_in_sync */ -static sector_t rs_get_progress(struct raid_set *rs, +static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors, bool *array_in_sync) { 
sector_t r, curr_resync_completed; @@ -3367,7 +3367,7 @@ static sector_t rs_get_progress(struct raid_set *rs, r = mddev->reshape_position; /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { *array_in_sync = true; @@ -3382,20 +3382,20 @@ static sector_t rs_get_progress(struct raid_set *rs, } /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) r = curr_resync_completed; else r = mddev->recovery_cp; if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (test_bit(MD_RECOVERY_DONE, &recovery) && (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ *array_in_sync = true; r = resync_max_sectors; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has * undergone an initial sync and the health characters @@ -3438,6 +3438,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; bool array_in_sync; + unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; unsigned int rebuild_disks; @@ -3457,13 +3458,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); + recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? 
mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = decipher_sync_action(&rs->md); + sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) -- cgit v1.2.3 From 242ea5ad11a03f2fbdfc2fe422d8e1b0601a8073 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:54 +0100 Subject: dm raid: avoid passing array_in_sync variable to raid_status() callees The raid_status() function passes the bool array_in_sync variable around providing synchronization state of the MD array. Replace it with a runtime flag. This will avoid a pattern of having to pass discrete variables to various functions. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 3df7c5bd5a9b..5730b32034aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 +#define RT_FLAG_RS_IN_SYNC 6 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3335,7 +3336,7 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. 
uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) { if (!rdev->bdev) return "-"; @@ -3343,25 +3344,27 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; - else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) + else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags)) return "a"; else return "A"; } -/* Helper to return resync/reshape progress for @rs and @array_in_sync */ +/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors, bool *array_in_sync) + sector_t resync_max_sectors) { sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - *array_in_sync = false; if (rs_is_raid0(rs)) { r = resync_max_sectors; - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { r = mddev->reshape_position; @@ -3370,7 +3373,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else { /* Got to reverse on backward reshape */ @@ -3393,7 +3396,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * Sync complete. 
*/ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* @@ -3401,7 +3404,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * undergone an initial sync and the health characters * should not be 'a' anymore. */ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { struct md_rdev *rdev; @@ -3414,7 +3417,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ #endif @@ -3437,7 +3440,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct mddev *mddev = &rs->md; struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; - bool array_in_sync; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; @@ -3462,14 +3464,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? 
It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); /* * In-sync/Reshape ratio: @@ -3520,7 +3522,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev) : "-"); break; case STATUSTYPE_TABLE: -- cgit v1.2.3 From 4102d9de6d375fc27ec70382c4068f4f9f62ce4f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:55 +0100 Subject: dm raid: fix rs_get_progress() synchronization state/ratio Fix various sync state issues causing racy/bogus sync ratio, sync_action and health chars in dm_status() info output. Sync ratio could be N/N (i.e. 100%) shortly after raid set creation, i.e. creating a new RaidLV or upconverting a linear LV to raid1 thus: "0 2097152 raid raid1 2 Aa 2097162/2097152 recover 0 0 -" instead of: "0 2097152 raid raid1 2 Aa 0/2097152 idle 0 0 -" Sync action could be non-idle, when the MD thread was done with io. Health chars could be 'A' when they should be 'a' for a short time before a resynchronization started. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 31 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5730b32034aa..7e7075fb9c28 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -210,6 +210,7 @@ struct raid_dev { #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 +#define RT_FLAG_RS_RESYNCING 7 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3306,8 +3307,10 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + /* The MD sync thread can be done with io but still be running */ + if (!test_bit(MD_RECOVERY_DONE, &recovery) && + (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; @@ -3344,8 +3347,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? 
"A" : "a"; - else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && - !test_bit(In_sync, &rdev->flags)) + else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) || + (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags))) return "a"; else return "A"; @@ -3355,49 +3359,70 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors) { - sector_t r, curr_resync_completed; + sector_t r; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - - curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; + clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); if (rs_is_raid0(rs)) { r = resync_max_sectors; set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - r = mddev->reshape_position; - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - r != MaxSector) { - if (r == MaxSector) { - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; - } else { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { + r = mddev->reshape_position; + if (r != MaxSector) { /* Got to reverse on backward reshape */ if (mddev->reshape_backwards) r = mddev->array_sectors - r; - /* Devide by # of data stripes */ - sector_div(r, mddev_data_stripes(rs)); + /* Divide by # of data stripes unless raid1 */ + if (!rs_is_raid1(rs)) + sector_div(r, mddev_data_stripes(rs)); } - /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) - r = curr_resync_completed; + /* + * Sync/recover is relative to the component device size. 
+ * + * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 + */ + } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) + r = mddev->curr_resync_completed; + else r = mddev->recovery_cp; - if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &recovery) && - (mddev->curr_resync_completed == resync_max_sectors))) { + if (r >= resync_max_sectors && + (!test_bit(MD_RECOVERY_REQUESTED, &recovery) || + (!test_bit(MD_RECOVERY_FROZEN, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery) && + !test_bit(MD_RECOVERY_RUNNING, &recovery)))) { /* * Sync complete. */ - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; + /* In case we have finished recovering, the array is in sync. */ + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) { + /* + * In case we are recovering, the array is not in sync + * and health chars should show the recovering legs. + */ + ; + + } else if (test_bit(MD_RECOVERY_SYNC, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "resync" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has @@ -3405,26 +3430,34 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * should not be 'a' anymore. */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + } else { struct md_rdev *rdev; + /* + * We are idle and recovery is needed, prevent 'A' chars race + * caused by components still set to in-sync by constrcuctor. 
+ */ + if (test_bit(MD_RECOVERY_NEEDED, &recovery)) + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + /* * The raid set may be doing an initial sync, or it may * be rebuilding individual components. If all the * devices are In_sync, then it is the raid set that is * being initialized. */ + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); -#if 0 - r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ -#endif + !test_bit(In_sync, &rdev->flags)) { + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + break; + } } } - return r; + return min(r, resync_max_sectors); } /* Helper to return @dev name or "-" if !@dev */ -- cgit v1.2.3 From 78a75d10ef869f4fae70f9b86afce28eb1922529 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:56 +0100 Subject: dm raid: small cleanup and remove unused "struct raid_set" member Move raid_resume()'s setting of 'ro' and 'in_sync' to just prior to mddev_resume(). Also, remove unused 'bitmap_loaded' member from "struct raid_set". No functional changes. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7e7075fb9c28..1069e617e727 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -227,7 +227,6 @@ struct rs_layout { struct raid_set { struct dm_target *ti; - uint32_t bitmap_loaded; uint32_t stripe_cache_entries; unsigned long ctr_flags; unsigned long runtime_flags; @@ -3964,9 +3963,6 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - mddev->ro = 0; - mddev->in_sync = 0; - /* Only reduce raid set size before running a disk removing reshape. 
*/ if (mddev->delta_disks < 0) rs_set_capacity(rs); @@ -3989,6 +3985,8 @@ static void raid_resume(struct dm_target *ti) if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); + mddev->ro = 0; + mddev->in_sync = 0; mddev_resume(mddev); mddev_unlock(mddev); } -- cgit v1.2.3 From b84cf26924cfe405993fc45fa2911cde38f3c3ac Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Dec 2017 10:26:21 -0500 Subject: dm raid: bump target version to reflect numerous fixes Also update Documentation accordingly. Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 4 +++- drivers/md/dm-raid.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 32df07e29f68..7b22375091fa 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -343,5 +343,7 @@ Version History 1.11.0 Fix table line argument order (wrong raid10_copies/raid10_format sequence) 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option -1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available +1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') +1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an + state races. 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1069e617e727..764baa9665bb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3994,7 +3994,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 0}, + .version = {1, 13, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From 53bf5384f9b9e37c628f171366959a38c89779ca Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:17 +0100 Subject: dm raid: validate current raid set's redundancy Verifying the current raid set's redundancy based on retrieved superblock content has to use the superblock's raid level (e.g. raid0), not the constructor requested one (e.g. raid10). Using the requested raid level of raid10 led to a "divide error" on raid0, which defines the number of data copies (the divisor) to be zero. Also check for bogus data copies. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 764baa9665bb..b82b7095a671 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1007,7 +1007,7 @@ static int validate_raid_redundancy(struct raid_set *rs) !rs->dev[i].rdev.sb_page) rebuild_cnt++; - switch (rs->raid_type->level) { + switch (rs->md.level) { case 0: break; case 1: @@ -1022,6 +1022,11 @@ static int validate_raid_redundancy(struct raid_set *rs) break; case 10: copies = raid10_md_layout_to_copies(rs->md.new_layout); + if (copies < 2) { + DMERR("Bogus raid10 data copies < 2!"); + return -EINVAL; + } + if (rebuild_cnt < copies) break; -- cgit v1.2.3 From 11e4723206683ad59f8e9dc7771e7b44a37f7b62 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:18 +0100 Subject: dm raid: stop keeping raid set frozen altogether In order to avoid redoing synchronization/recovery/reshape 
partially, the raid set got frozen until after all passed in table line flags had been cleared. The related table reload sequence had to be precisely followed, or reshaping may lead to data corruption caused by the active mapping carrying on with a reshape when the inactive mapping already had retrieved a stale reshape position. Harden by retrieving the actual resync/recovery/reshape position during resume whilst the active table is suspended thus avoiding to keep the raid set frozen altogether. This prevents superfluous redoing of an already resynchronized or recovered segment and, most importantly, potential for redoing of an already reshaped segment causing data corruption. Fixes: d39f0010e ("dm raid: fix raid_resume() to keep raid set frozen as needed") Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 108 +++++++++++++++++++++----------- 2 files changed, 71 insertions(+), 38 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 7b22375091fa..390c145f01d7 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -347,3 +347,4 @@ Version History 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an state races. 
+1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b82b7095a671..109b001407a8 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -29,6 +29,9 @@ */ #define MIN_RAID456_JOURNAL_SPACE (4*2048) +/* Global list of all raid sets */ +LIST_HEAD(raid_sets); + static bool devices_handle_discard_safely = false; /* @@ -105,8 +108,6 @@ struct raid_dev { #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) -#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) - /* * Definitions of various constructor flags to * be used in checks of valid / invalid flags @@ -226,6 +227,7 @@ struct rs_layout { struct raid_set { struct dm_target *ti; + struct list_head list; uint32_t stripe_cache_entries; unsigned long ctr_flags; @@ -271,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) mddev->new_chunk_sectors = l->new_chunk_sectors; } +/* Find any raid_set in active slot for @rs on global list */ +static struct raid_set *rs_find_active(struct raid_set *rs) +{ + struct raid_set *r; + struct mapped_device *md = dm_table_get_md(rs->ti->table); + + list_for_each_entry(r, &raid_sets, list) + if (r != rs && dm_table_get_md(r->ti->table) == md) + return r; + + return NULL; +} + /* raid10 algorithms (i.e. formats) */ #define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_NEAR 1 @@ -749,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r mddev_init(&rs->md); + INIT_LIST_HEAD(&rs->list); rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -766,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); + /* Add @rs to global list. 
*/ + list_add(&rs->list, &raid_sets); + /* * Remaining items to be initialized by further RAID params: * rs->md.persistent @@ -778,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return rs; } +/* Free all @rs allocations and remove it from global list. */ static void raid_set_free(struct raid_set *rs) { int i; @@ -795,6 +815,8 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + list_del(&rs->list); + kfree(rs); } @@ -2371,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) DMERR("new device%s provided without 'rebuild'", new_devs > 1 ? "s" : ""); return -EINVAL; - } else if (rs_is_recovering(rs)) { + } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", (unsigned long long) mddev->recovery_cp); return -EINVAL; @@ -3173,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* - * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape - * check functions via its MD personality instance. - * - * So do the reshape check after md_run() succeeded. - */ - r = rs_prepare_reshape(rs); - if (r) - return r; + /* Out-of-place space has to be available to allow for a reshape unless raid1! */ + if (reshape_sectors || rs_is_raid1(rs)) { + /* + * We can only prepare for a reshape here, because the + * raid set needs to run to provide the repective reshape + * check functions via its MD personality instance. + * + * So do the reshape check after md_run() succeeded. 
+ */ + r = rs_prepare_reshape(rs); + if (r) + return r; - /* Reshaping ain't recovery, so disable recovery */ - rs_setup_recovery(rs, MaxSector); + /* Reshaping ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + } rs_set_cur(rs); } else { /* May not set recovery when a device rebuild is requested */ @@ -3395,7 +3420,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; - else r = mddev->recovery_cp; @@ -3904,10 +3928,33 @@ static int raid_preresume(struct dm_target *ti) struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - /* This is a resume after a suspend of the set -> it's already started */ + /* This is a resume after a suspend of the set -> it's already started. */ if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { + struct raid_set *rs_active = rs_find_active(rs); + + if (rs_active) { + /* + * In case no rebuilds have been requested + * and an active table slot exists, copy + * current resynchonization completed and + * reshape position pointers across from + * suspended raid set in the active slot. + * + * This resumes the new mapping at current + * offsets to continue recover/reshape without + * necessarily redoing a raid set partially or + * causing data corruption in case of a reshape. 
+ */ + if (rs_active->md.curr_resync_completed != MaxSector) + mddev->curr_resync_completed = rs_active->md.curr_resync_completed; + if (rs_active->md.reshape_position != MaxSector) + mddev->reshape_position = rs_active->md.reshape_position; + } + } + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed @@ -3968,28 +4015,13 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - /* Only reduce raid set size before running a disk removing reshape. */ - if (mddev->delta_disks < 0) - rs_set_capacity(rs); - - /* - * Keep the RAID set frozen if reshape/rebuild flags are set. - * The RAID set is unfrozen once the next table load/resume, - * which clears the reshape/rebuild flags, occurs. - * This ensures that the constructor for the inactive table - * retrieves an up-to-date reshape_position. - */ - if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && - !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { - if (rs_is_reshapable(rs)) { - if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Only reduce raid set size before running a disk removing reshape. 
*/ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + mddev_lock_nointr(mddev); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; mddev_resume(mddev); @@ -3999,7 +4031,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 1}, + .version = {1, 13, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From dc15b943d4651bc13b9737bb27283ad9d3b8eeba Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:19 +0100 Subject: dm raid: ensure 'a' chars during reshape During reshape, 'A' chars were reported in status rather than 'a'. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 109b001407a8..af4f40de2c0b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3451,6 +3451,15 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "reshape" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has -- cgit v1.2.3 From 7c29744eccecc2c74c9b4d1ea0a60b4d95229399 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:20 +0100 Subject: dm raid: simplify rs_get_progress() No need to calculate the reshaping progress because mddev->curr_resync_completed holds it. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index af4f40de2c0b..21e007c89c2e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3399,26 +3399,9 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { - r = mddev->reshape_position; - if (r != MaxSector) { - /* Got to reverse on backward reshape */ - if (mddev->reshape_backwards) - r = mddev->array_sectors - r; - - /* Divide by # of data stripes unless raid1 */ - if (!rs_is_raid1(rs)) - sector_div(r, mddev_data_stripes(rs)); - } - - /* - * Sync/recover is relative to the component device size. - * - * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 - */ - } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || - test_bit(MD_RECOVERY_RUNNING, &recovery)) + if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RESHAPE, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; else r = mddev->recovery_cp; -- cgit v1.2.3 From 552aa679f265743163fb440c61370a9c51f66c81 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:21 +0100 Subject: dm raid: use rs_is_raid*() Cleanup, no functional change. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 21e007c89c2e..7d7dc1723180 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout) } /* Return md raid10 algorithm for @name */ -static int raid10_name_to_format(const char *name) +static const int raid10_name_to_format(const char *name) { if (!strcasecmp(name, "near")) return ALGORITHM_RAID10_NEAR; @@ -1913,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_takeover_requested(rs)) return false; - if (!mddev->level) + if (rs_is_raid0(rs)) return false; change = mddev->new_layout != mddev->layout || @@ -1921,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs) rs->delta_disks; /* Historical case to support raid1 reshape without delta disks */ - if (mddev->level == 1) { + if (rs_is_raid1(rs)) { if (rs->delta_disks) return !!rs->delta_disks; @@ -1929,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs) mddev->raid_disks != rs->raid_disks; } - if (mddev->level == 10) + if (rs_is_raid10(rs)) return change && !__is_raid10_far(mddev->new_layout) && rs->delta_disks >= 0; @@ -2742,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs) sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 
0 : rs->data_offset; if (rt_is_raid10(rs->raid_type)) { - if (mddev->level == 0) { + if (rs_is_raid0(rs)) { /* Userpace reordered disks -> adjust raid_disk indexes */ __reorder_raid_disk_indexes(rs); /* raid0 -> raid10_far layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, rs->raid10_copies); - } else if (mddev->level == 1) + } else if (rs_is_raid1(rs)) /* raid1 -> raid10_near layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, rs->raid_disks); @@ -2977,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs) /* * XXX: RAID level 4,5,6 require zeroing for safety. */ - raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); + raid456 = rs_is_raid456(rs); for (i = 0; i < rs->raid_disks; i++) { struct request_queue *q; @@ -3002,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs) * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. */ - ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10); + ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } -- cgit v1.2.3 From 67ac901c553bab4bcc05ed1253829bf462c26b1f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 2 Jan 2018 06:18:10 +0000 Subject: dm raid: make raid_sets symbol static Fixes the following sparse warning: drivers/md/dm-raid.c:33:1: warning: symbol 'raid_sets' was not declared. Should it be static? 
Signed-off-by: Wei Yongjun Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/dm-raid.c') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7d7dc1723180..d46d1945fbcc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -30,7 +30,7 @@ #define MIN_RAID456_JOURNAL_SPACE (4*2048) /* Global list of all raid sets */ -LIST_HEAD(raid_sets); +static LIST_HEAD(raid_sets); static bool devices_handle_discard_safely = false; -- cgit v1.2.3