From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 10 Oct 2017 17:02:41 -0400
Subject: md: rename some drivers/md/ files to have an "md-" prefix

Motivated by the desire to illiminate the imprecise nature of
DM-specific patches being unnecessarily sent to both the MD maintainer
and mailing-list.  Which is born out of the fact that DM files also
reside in drivers/md/

Now all MD-specific files in drivers/md/ start with either "raid" or
"md-" and the MAINTAINERS file has been updated accordingly.

Shaohua: don't change module name

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 928e24a07133..10c0d87074f0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,7 +63,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
 #include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
-- 
cgit v1.2.3


From 235b6003fb28f0dd8e7ed8fbdb088bb548291766 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 16:18:36 +1100
Subject: raid5: Set R5_Expanded on parity devices as well as data.

When reshaping a fully degraded raid5/raid6 to a larger
nubmer of devices, the new device(s) are not in-sync
and so that can make the newly grown stripe appear to be
"failed".
To avoid this, we set the R5_Expanded flag to say "Even though
this device is not fully in-sync, this block is safe so
don't treat the device as failed for this stripe".
This flag is set for data devices, not not for parity devices.

Consequently, if you have a RAID6 with two devices that are partly
recovered and a spare, and start a reshape to include the spare,
then when the reshape gets past the point where the recovery was
up to, it will think the stripes are failed and will get into
an infinite loop, failing to make progress.

So when contructing parity on an EXPAND_READY stripe,
set R5_Expanded.

Reported-by: Curt <lightspd@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 10c0d87074f0..a21dbd22a2fb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1818,8 +1818,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
 				set_bit(R5_UPTODATE, &dev->flags);
+				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
+					set_bit(R5_Expanded, &dev->flags);
+			}
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
 			if (sync)
-- 
cgit v1.2.3


From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 14:24:09 +1100
Subject: md: forbid a RAID5 from having both a bitmap and a journal.

Having both a bitmap and a journal is pointless.
Attempting to do so can corrupt the bitmap if the journal
replay happens before the bitmap is initialized.
Rather than try to avoid this corruption, simply
refuse to allow arrays with both a bitmap and a journal.
So:
 - if raid5_run sees both are present, fail.
 - if adding a bitmap finds a journal is present, fail
 - if adding a journal finds a bitmap is present, fail.

Cc: stable@vger.kernel.org (4.10+)
Signed-off-by: NeilBrown <neilb@suse.com>
Tested-by: Joshua Kinard <kumba@gentoo.org>
Acked-by: Joshua Kinard <kumba@gentoo.org>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-bitmap.c | 6 ++++++
 drivers/md/md.c        | 2 +-
 drivers/md/raid5.c     | 7 +++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b843b53b0f65..d1b3b60669ea 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1816,6 +1816,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 
 	BUG_ON(file && mddev->bitmap_info.offset);
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
+			  mdname(mddev));
+		return ERR_PTR(-EBUSY);
+	}
+
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 97afb28c6f51..6f25e3f1a1cf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6362,7 +6362,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 					break;
 				}
 			}
-			if (has_journal) {
+			if (has_journal || mddev->bitmap) {
 				export_rdev(rdev);
 				return -EBUSY;
 			}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a21dbd22a2fb..a8732955f130 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7159,6 +7159,13 @@ static int raid5_run(struct mddev *mddev)
 			min_offset_diff = diff;
 	}
 
+	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
+	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
+		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
+			  mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Difficulties arise if the stripe we would write to
-- 
cgit v1.2.3


From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 13:46:43 +1100
Subject: md: move suspend_hi/lo handling into core md code

responding to ->suspend_lo and ->suspend_hi is similar
to responding to ->suspended.  It is best to wait in
the common core code without incrementing ->active_io.
This allows mddev_suspend()/mddev_resume() to work while
requests are waiting for suspend_lo/hi to change.
This is will be important after a subsequent patch
which uses mddev_suspend() to synchronize updating for
suspend_lo/hi.

So move the code for testing suspend_lo/hi out of raid1.c
and raid5.c, and place it in md.c

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c    | 29 +++++++++++++++++++++++------
 drivers/md/raid1.c | 14 +++++---------
 drivers/md/raid5.c | 22 ----------------------
 3 files changed, 28 insertions(+), 37 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2cb49f639809..68de2a6ee29a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
+static bool is_suspended(struct mddev *mddev, struct bio *bio)
+{
+	if (mddev->suspended)
+		return true;
+	if (bio_data_dir(bio) != WRITE)
+		return false;
+	if (mddev->suspend_lo >= mddev->suspend_hi)
+		return false;
+	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+		return false;
+	if (bio_end_sector(bio) < mddev->suspend_lo)
+		return false;
+	return true;
+}
+
 void md_handle_request(struct mddev *mddev, struct bio *bio)
 {
 check_suspended:
 	rcu_read_lock();
-	if (mddev->suspended) {
+	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended)
+			if (!is_suspended(mddev, bio))
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -4845,10 +4860,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 		goto unlock;
 	old = mddev->suspend_lo;
 	mddev->suspend_lo = new;
-	if (new >= old)
+	if (new >= old) {
 		/* Shrinking suspended region */
+		wake_up(&mddev->sb_wait);
 		mddev->pers->quiesce(mddev, 2);
-	else {
+	} else {
 		/* Expanding suspended region - need to wait */
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
@@ -4888,10 +4904,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 		goto unlock;
 	old = mddev->suspend_hi;
 	mddev->suspend_hi = new;
-	if (new <= old)
+	if (new <= old) {
 		/* Shrinking suspended region */
+		wake_up(&mddev->sb_wait);
 		mddev->pers->quiesce(mddev, 2);
-	else {
+	} else {
 		/* Expanding suspended region - need to wait */
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index efdabd3040e7..fb56ef79a1c3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1310,11 +1310,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 
 
-	if ((bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
-	    (mddev_is_clustered(mddev) &&
+	if (mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
-		     bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
+		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
 		/*
 		 * As the suspend_* range is controlled by userspace, we want
@@ -1325,12 +1323,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			sigset_t full, old;
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
-			if ((bio_end_sector(bio) <= mddev->suspend_lo ||
-			     bio->bi_iter.bi_sector >= mddev->suspend_hi) &&
-			    (!mddev_is_clustered(mddev) ||
-			     !md_cluster_ops->area_resyncing(mddev, WRITE,
+			if (!mddev_is_clustered(mddev) ||
+			    !md_cluster_ops->area_resyncing(mddev, WRITE,
 							bio->bi_iter.bi_sector,
-							bio_end_sector(bio))))
+							bio_end_sector(bio)))
 				break;
 			sigfillset(&full);
 			sigprocmask(SIG_BLOCK, &full, &old);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a8732955f130..354a969f50a6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5685,28 +5685,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 				goto retry;
 			}
 
-			if (rw == WRITE &&
-			    logical_sector >= mddev->suspend_lo &&
-			    logical_sector < mddev->suspend_hi) {
-				raid5_release_stripe(sh);
-				/* As the suspend_* range is controlled by
-				 * userspace, we want an interruptible
-				 * wait.
-				 */
-				prepare_to_wait(&conf->wait_for_overlap,
-						&w, TASK_INTERRUPTIBLE);
-				if (logical_sector >= mddev->suspend_lo &&
-				    logical_sector < mddev->suspend_hi) {
-					sigset_t full, old;
-					sigfillset(&full);
-					sigprocmask(SIG_BLOCK, &full, &old);
-					schedule();
-					sigprocmask(SIG_SETMASK, &old, NULL);
-					do_prepare = true;
-				}
-				goto retry;
-			}
-
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
 			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
 				/* Stripe is busy expanding or
-- 
cgit v1.2.3


From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 19 Oct 2017 12:49:15 +1100
Subject: md: remove special meaning of ->quiesce(.., 2)

The '2' argument means "wake up anything that is waiting".
This is an inelegant part of the design and was added
to help support management of suspend_lo/suspend_hi setting.
Now that suspend_lo/hi is managed in mddev_suspend/resume,
that need is gone.
These is still a couple of places where we call 'quiesce'
with an argument of '2', but they can safely be changed to
call ->quiesce(.., 1); ->quiesce(.., 0) which
achieve the same result at the small cost of pausing IO
briefly.

This removes a small "optimization" from suspend_{hi,lo}_store,
but it isn't clear that optimization served a useful purpose.
The code now is a lot clearer.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c  |  6 +++---
 drivers/md/md.c          | 34 ++++++++++------------------------
 drivers/md/md.h          |  9 ++++-----
 drivers/md/raid0.c       |  2 +-
 drivers/md/raid1.c       | 13 +++----------
 drivers/md/raid10.c      | 10 +++-------
 drivers/md/raid5-cache.c | 12 ++++++------
 drivers/md/raid5-log.h   |  2 +-
 drivers/md/raid5.c       | 18 ++++++------------
 9 files changed, 37 insertions(+), 69 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index bc81ecc24c96..d0fd1bd8575c 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 static void remove_suspend_info(struct mddev *mddev, int slot)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	mddev->pers->quiesce(mddev, 1);
 	spin_lock_irq(&cinfo->suspend_lock);
 	__remove_suspend_info(cinfo, slot);
 	spin_unlock_irq(&cinfo->suspend_lock);
-	mddev->pers->quiesce(mddev, 2);
+	mddev->pers->quiesce(mddev, 0);
 }
 
 
@@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev,
 	s->lo = lo;
 	s->hi = hi;
 	mddev->pers->quiesce(mddev, 1);
-	mddev->pers->quiesce(mddev, 0);
 	spin_lock_irq(&cinfo->suspend_lock);
 	/* Remove existing entry (if exists) before adding */
 	__remove_suspend_info(cinfo, slot);
 	list_add(&s->list, &cinfo->suspend_list);
 	spin_unlock_irq(&cinfo->suspend_lock);
-	mddev->pers->quiesce(mddev, 2);
+	mddev->pers->quiesce(mddev, 0);
 }
 
 static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9155f00dca20..d441b1d9846c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4846,7 +4846,7 @@ suspend_lo_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	unsigned long long old, new;
+	unsigned long long new;
 	int err;
 
 	err = kstrtoull(buf, 10, &new);
@@ -4862,17 +4862,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 	if (mddev->pers == NULL ||
 	    mddev->pers->quiesce == NULL)
 		goto unlock;
-	old = mddev->suspend_lo;
+	mddev_suspend(mddev);
 	mddev->suspend_lo = new;
-	if (new >= old) {
-		/* Shrinking suspended region */
-		wake_up(&mddev->sb_wait);
-		mddev->pers->quiesce(mddev, 2);
-	} else {
-		/* Expanding suspended region - need to wait */
-		mddev_suspend(mddev);
-		mddev_resume(mddev);
-	}
+	mddev_resume(mddev);
+
 	err = 0;
 unlock:
 	mddev_unlock(mddev);
@@ -4890,7 +4883,7 @@ suspend_hi_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	unsigned long long old, new;
+	unsigned long long new;
 	int err;
 
 	err = kstrtoull(buf, 10, &new);
@@ -4903,20 +4896,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	err = -EINVAL;
-	if (mddev->pers == NULL ||
-	    mddev->pers->quiesce == NULL)
+	if (mddev->pers == NULL)
 		goto unlock;
-	old = mddev->suspend_hi;
+
+	mddev_suspend(mddev);
 	mddev->suspend_hi = new;
-	if (new <= old) {
-		/* Shrinking suspended region */
-		wake_up(&mddev->sb_wait);
-		mddev->pers->quiesce(mddev, 2);
-	} else {
-		/* Expanding suspended region - need to wait */
-		mddev_suspend(mddev);
-		mddev_resume(mddev);
-	}
+	mddev_resume(mddev);
+
 	err = 0;
 unlock:
 	mddev_unlock(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03fc641e5da1..998b4ce1498f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -544,12 +544,11 @@ struct md_personality
 	int (*check_reshape) (struct mddev *mddev);
 	int (*start_reshape) (struct mddev *mddev);
 	void (*finish_reshape) (struct mddev *mddev);
-	/* quiesce moves between quiescence states
-	 * 0 - fully active
-	 * 1 - no new requests allowed
-	 * others - reserved
+	/* quiesce suspends or resumes internal processing.
+	 * 1 - stop new actions and wait for action io to complete
+	 * 0 - return to normal behaviour
 	 */
-	void (*quiesce) (struct mddev *mddev, int state);
+	void (*quiesce) (struct mddev *mddev, int quiesce);
 	/* takeover is used to transition an array from one
 	 * personality to another.  The new personality must be able
 	 * to handle the data in the current layout.
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5a00fc118470..5ecba9eef441 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev)
 	return ERR_PTR(-EINVAL);
 }
 
-static void raid0_quiesce(struct mddev *mddev, int state)
+static void raid0_quiesce(struct mddev *mddev, int quiesce)
 {
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb56ef79a1c3..9428dfa7e9a0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3273,21 +3273,14 @@ static int raid1_reshape(struct mddev *mddev)
 	return 0;
 }
 
-static void raid1_quiesce(struct mddev *mddev, int state)
+static void raid1_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r1conf *conf = mddev->private;
 
-	switch(state) {
-	case 2: /* wake for suspend */
-		wake_up(&conf->wait_barrier);
-		break;
-	case 1:
+	if (quiesce)
 		freeze_array(conf, 0);
-		break;
-	case 0:
+	else
 		unfreeze_array(conf);
-		break;
-	}
 }
 
 static void *raid1_takeover(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b0de5b5ee689..615f677ceb1a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3828,18 +3828,14 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
-static void raid10_quiesce(struct mddev *mddev, int state)
+static void raid10_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r10conf *conf = mddev->private;
 
-	switch(state) {
-	case 1:
+	if (quiesce)
 		raise_barrier(conf, 0);
-		break;
-	case 0:
+	else
 		lower_barrier(conf);
-		break;
-	}
 }
 
 static int raid10_resize(struct mddev *mddev, sector_t sectors)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 59af7cf35092..037ed274807f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1589,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 	md_wakeup_thread(log->reclaim_thread);
 }
 
-void r5l_quiesce(struct r5l_log *log, int state)
+void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
 	struct mddev *mddev;
-	if (!log || state == 2)
+	if (!log)
 		return;
-	if (state == 0)
-		kthread_unpark(log->reclaim_thread->tsk);
-	else if (state == 1) {
+
+	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
 		kthread_park(log->reclaim_thread->tsk);
 		r5l_wake_reclaim(log, MaxSector);
 		r5l_do_reclaim(log);
-	}
+	} else
+		kthread_unpark(log->reclaim_thread->tsk);
 }
 
 bool r5l_log_disk_error(struct r5conf *conf)
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 328d67aedda4..c3596a27a5a8 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -8,7 +8,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log);
 extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
 extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
+extern void r5l_quiesce(struct r5l_log *log, int quiesce);
 extern bool r5l_log_disk_error(struct r5conf *conf);
 extern bool r5c_is_writeback(struct r5l_log *log);
 extern int
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 354a969f50a6..17ffa1e44c84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8008,16 +8008,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
 	}
 }
 
-static void raid5_quiesce(struct mddev *mddev, int state)
+static void raid5_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r5conf *conf = mddev->private;
 
-	switch(state) {
-	case 2: /* resume for a suspend */
-		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 1: /* stop all writes */
+	if (quiesce) {
+		/* stop all writes */
 		lock_all_device_hash_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
@@ -8033,17 +8029,15 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		unlock_all_device_hash_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 0: /* re-enable writes */
+	} else {
+		/* re-enable writes */
 		lock_all_device_hash_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
-		break;
 	}
-	r5l_quiesce(conf->log, state);
+	r5l_quiesce(conf->log, quiesce);
 }
 
 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
-- 
cgit v1.2.3


From ae89fd3de4793c0dc2ec7e9f26b58a357d74a6c7 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 18 Oct 2017 19:01:11 -0400
Subject: md: use TASK_IDLE instead of blocking signals

Hi - I submit this patch for the next merge window:

Some times ago, I made a patch f9c79bc05a2a that blocks signals around the
schedule() calls in MD. The MD subsystem needs to do an uninterruptible
sleep that is not accounted in load average - so we block signals and use
interruptible sleep.

The kernel has a special TASK_IDLE state for this purpose, so we can use
it instead of blocking signals. This patch doesn't fix any bug, it just
makes the code simpler.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 7 +------
 drivers/md/raid5.c | 1 -
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9428dfa7e9a0..1f36473c79dc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,7 +37,6 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
-#include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
 
@@ -1320,18 +1319,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		 */
 		DEFINE_WAIT(w);
 		for (;;) {
-			sigset_t full, old;
 			prepare_to_wait(&conf->wait_barrier,
-					&w, TASK_INTERRUPTIBLE);
+					&w, TASK_IDLE);
 			if (!mddev_is_clustered(mddev) ||
 			    !md_cluster_ops->area_resyncing(mddev, WRITE,
 							bio->bi_iter.bi_sector,
 							bio_end_sector(bio)))
 				break;
-			sigfillset(&full);
-			sigprocmask(SIG_BLOCK, &full, &old);
 			schedule();
-			sigprocmask(SIG_SETMASK, &old, NULL);
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 17ffa1e44c84..2a4b34941d86 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,7 +55,6 @@
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
-#include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
 #include <linux/list_sort.h>
-- 
cgit v1.2.3


From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 16:18:36 +1100
Subject: md: be cautious about using ->curr_resync_completed for
 ->recovery_offset

The ->recovery_offset shows how much of a non-InSync device is actually
in sync - how much has been recoveryed.

When performing a recovery, ->curr_resync and ->curr_resync_completed
follow the device address being recovered and so can be used to update
->recovery_offset.

When performing a reshape, ->curr_resync* might follow the device
addresses (raid5) or might follow array addresses (raid10), so cannot
in general be used to set ->recovery_offset.  When reshaping backwards,
->curre_resync* measures from the *end* of the array-or-device, so is
particularly unhelpful.

So change the common code in md.c to only use ->curr_resync_complete
for the simple recovery case, and add code to raid5.c to update
->recovery_offset during a forwards reshape.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c    | 33 ++++++++++++++++++++++-----------
 drivers/md/raid5.c | 24 ++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 11 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2bf4cc41b4f8..15e4668f594c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2454,10 +2454,18 @@ repeat:
 		}
 	}
 
-	/* First make sure individual recovery_offsets are correct */
+	/*
+	 * First make sure individual recovery_offsets are correct
+	 * curr_resync_completed can only be used during recovery.
+	 * During reshape/resync it might use array-addresses rather
+	 * that device addresses.
+	 */
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
 		    mddev->delta_disks >= 0 &&
+		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 		    !test_bit(Journal, &rdev->flags) &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    mddev->curr_resync_completed > rdev->recovery_offset)
@@ -8491,16 +8499,19 @@ void md_do_sync(struct md_thread *thread)
 		} else {
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 				mddev->curr_resync = MaxSector;
-			rcu_read_lock();
-			rdev_for_each_rcu(rdev, mddev)
-				if (rdev->raid_disk >= 0 &&
-				    mddev->delta_disks >= 0 &&
-				    !test_bit(Journal, &rdev->flags) &&
-				    !test_bit(Faulty, &rdev->flags) &&
-				    !test_bit(In_sync, &rdev->flags) &&
-				    rdev->recovery_offset < mddev->curr_resync)
-					rdev->recovery_offset = mddev->curr_resync;
-			rcu_read_unlock();
+			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
+				rcu_read_lock();
+				rdev_for_each_rcu(rdev, mddev)
+					if (rdev->raid_disk >= 0 &&
+					    mddev->delta_disks >= 0 &&
+					    !test_bit(Journal, &rdev->flags) &&
+					    !test_bit(Faulty, &rdev->flags) &&
+					    !test_bit(In_sync, &rdev->flags) &&
+					    rdev->recovery_offset < mddev->curr_resync)
+						rdev->recovery_offset = mddev->curr_resync;
+				rcu_read_unlock();
+			}
 		}
 	}
  skip:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2a4b34941d86..1649e82faae2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5738,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 */
 	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
+	struct md_rdev *rdev;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -5860,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 			return 0;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
+		if (!mddev->reshape_backwards)
+			/* Can update recovery_offset */
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < sector_nr)
+					rdev->recovery_offset = sector_nr;
+
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
@@ -5958,6 +5968,14 @@ finish:
 			goto ret;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
+		if (!mddev->reshape_backwards)
+			/* Can update recovery_offset */
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < sector_nr)
+					rdev->recovery_offset = sector_nr;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
@@ -7945,6 +7963,7 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
@@ -7952,6 +7971,11 @@ static void end_reshape(struct r5conf *conf)
 		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		conf->mddev->reshape_position = MaxSector;
+		rdev_for_each(rdev, conf->mddev)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Journal, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags))
+				rdev->recovery_offset = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 
-- 
cgit v1.2.3