From 5815839b3ca16bb1d45939270871169f6803a121 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 25 Jun 2018 19:31:47 +0800
Subject: blk-mq: introduce new lock for protecting hctx->dispatch_wait

Now hctx->lock is only acquired when adding hctx->dispatch_wait to
one wait queue, but not held when removing it from the wait queue.

IO hang can be observed easily if SCHED RESTART is disabled, that means
now RESTART exits just for fixing the issue in blk_mq_mark_tag_wait().

This patch fixes the issue by introducing hctx->dispatch_wait_lock and
holding it for removing hctx->dispatch_wait in blk_mq_dispatch_wake(),
since we need to avoid acquiring hctx->lock in irq context.

Fixes: eb619fdb2d4cb8b3d3419 ("blk-mq: fix issue with shared tag queue re-running")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/blk-mq.h')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e3147eb74222..ea690254dab7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -39,6 +39,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
 	atomic_t		wait_index;
 
-- 
cgit v1.2.3


From 6e768717304bdbe8d2897ca8298f6b58863fdc41 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 3 Jul 2018 09:03:16 -0600
Subject: blk-mq: dequeue request one by one from sw queue if hctx is busy

It won't be efficient to dequeue request one by one from sw queue,
but we have to do that when queue is busy for better merge performance.

This patch takes the Exponential Weighted Moving Average(EWMA) to figure
out if queue is busy, then only dequeue request one by one from sw queue
when queue is busy.

Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Hannes Reinecke <hare@suse.de>
Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c |  9 +++++++++
 block/blk-mq-sched.c   | 11 ++---------
 block/blk-mq.c         | 33 ++++++++++++++++++++++++++++++++-
 include/linux/blk-mq.h |  3 ++-
 4 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'include/linux/blk-mq.h')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7efe268e4447..cb1e6cf7ac48 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -622,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%u\n", hctx->dispatch_busy);
+	return 0;
+}
+
 static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
 	__acquires(&ctx->lock)
 {
@@ -783,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"queued", 0600, hctx_queued_show, hctx_queued_write},
 	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
+	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{},
 };
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f3b4b5ceb4d1..fdc129e64cc4 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -206,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		}
 	} else if (has_sched_dispatch) {
 		blk_mq_do_dispatch_sched(hctx);
-	} else if (q->mq_ops->get_budget) {
-		/*
-		 * If we need to get budget before queuing request, we
-		 * dequeue request one by one from sw queue for avoiding
-		 * to mess up I/O merge when dispatch runs out of resource.
-		 *
-		 * TODO: get more budgets, and dequeue more requests in
-		 * one time.
-		 */
+	} else if (hctx->dispatch_busy) {
+		/* dequeue request one by one from sw queue if queue is busy */
 		blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 795ba859b16b..850fdd02c385 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1074,6 +1074,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	return true;
 }
 
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+/*
+ * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
+ * - EWMA is one simple way to compute running average value
+ * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
+ * - take 4 as factor for avoiding to get too small(0) result, and this
+ *   factor doesn't matter because EWMA decreases exponentially
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+	unsigned int ewma;
+
+	if (hctx->queue->elevator)
+		return;
+
+	ewma = hctx->dispatch_busy;
+
+	if (!ewma && !busy)
+		return;
+
+	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+	if (busy)
+		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+	hctx->dispatch_busy = ewma;
+}
+
 #define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
 
 /*
@@ -1210,8 +1239,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		else if (needs_restart && (ret == BLK_STS_RESOURCE))
 			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
+		blk_mq_update_dispatch_busy(hctx, true);
 		return false;
-	}
+	} else
+		blk_mq_update_dispatch_busy(hctx, false);
 
 	/*
 	 * If the host/device is unable to accept more work, inform the
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ea690254dab7..d710e92874cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -35,9 +35,10 @@ struct blk_mq_hw_ctx {
 	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	*dispatch_from;
+	unsigned int		dispatch_busy;
 
-	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
+	struct blk_mq_ctx	**ctxs;
 
 	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
-- 
cgit v1.2.3