From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Dec 2017 15:43:38 +0900
Subject: block: introduce zoned block devices zone write locking

Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have a limited knowledged of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also result in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zone_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required) and the bitmap seq_zones_wlock which
indicates if a zone is write locked, that is, if a write request
targeting a zone was dispatched to the device. These fields are
initialized by the low level block device driver (sd.c for ZBC/ZAC
disks). They are not initialized by stacking drivers (device mappers)
handling zoned block devices (e.g. dm-linear).

Using this, I/O schedulers can introduce zone write locking to control
request dispatching to a zoned block device and avoid write request
reordering by limiting to at most a single write request per zone
outside of the scheduler at any time.

Based on previous patches from Damien Le Moal.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[Damien]
* Fixed comments and identation in blkdev.h
* Changed helper functions
* Fixed this commit message
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8089ca17db9a..46e606f5b44b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
    bio chain. */
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -546,6 +548,22 @@ struct request_queue {
 
 	struct queue_limits	limits;
 
+	/*
+	 * Zoned block device information for request dispatch control.
+	 * nr_zones is the total number of zones of the device. This is always
+	 * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+	 * bits which indicates if a zone is conventional (bit clear) or
+	 * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+	 * bits which indicates if a zone is write locked, that is, if a write
+	 * request targeting the zone was dispatched. All three fields are
+	 * initialized by the low level device driver (e.g. scsi/sd.c).
+	 * Stacking drivers (device mappers) may or may not initialize
+	 * these fields.
+	 */
+	unsigned int		nr_zones;
+	unsigned long		*seq_zones_bitmap;
+	unsigned long		*seq_zones_wlock;
+
 	/*
 	 * sg stuff
 	 */
@@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
+static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+{
+	return q->nr_zones;
+}
+
+static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+					     sector_t sector)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
+	return sector >> ilog2(q->limits.chunk_sectors);
+}
+
+static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+					 sector_t sector)
+{
+	if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+		return false;
+	return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+}
+
 static inline bool rq_is_sync(struct request *rq)
 {
 	return op_is_sync(rq->cmd_flags);
@@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
  * is different from the size of the request.  Any driver that supports such
@@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 
 	if (q)
 		return blk_queue_zone_sectors(q);
+	return 0;
+}
 
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return blk_queue_nr_zones(q);
 	return 0;
 }
 
@@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+bool blk_req_needs_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+	if (blk_req_needs_zone_write_lock(rq))
+		__blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+		__blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return rq->q->seq_zones_wlock &&
+		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	if (!blk_req_needs_zone_write_lock(rq))
+		return true;
+	return !blk_req_zone_is_write_locked(rq);
+}
+#else
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+	return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.2.3


From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 08:29:48 -0800
Subject: blk-mq: replace timeout synchronization with a RCU and generation
 based scheme

Currently, blk-mq timeout path synchronizes against the usual
issue/completion path using a complex scheme involving atomic
bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence
rules.  Unfortunately, it contains quite a few holes.

There's a complex dancing around REQ_ATOM_STARTED and
REQ_ATOM_COMPLETE between issue/completion and timeout paths; however,
they don't have a synchronization point across request recycle
instances and it isn't clear what the barriers add.
blk_mq_check_expired() can easily read STARTED from N-2'th iteration,
deadline from N-1'th, blk_mark_rq_complete() against Nth instance.

In fact, it's pretty easy to make blk_mq_check_expired() terminate a
later instance of a request.  If we induce 5 sec delay before
time_after_eq() test in blk_mq_check_expired(), shorten the timeout to
2s, and issue back-to-back large IOs, blk-mq starts timing out
requests spuriously pretty quickly.  Nothing actually timed out.  It
just made the call on a recycle instance of a request and then
terminated a later instance long after the original instance finished.
The scenario isn't theoretical either.

This patch replaces the broken synchronization mechanism with a RCU
and generation number based one.

1. Each request has a u64 generation + state value, which can be
   updated only by the request owner.  Whenever a request becomes
   in-flight, the generation number gets bumped up too.  This provides
   the basis for the timeout path to distinguish different recycle
   instances of the request.

   Also, marking a request in-flight and setting its deadline are
   protected with a seqcount so that the timeout path can fetch both
   values coherently.

2. The timeout path fetches the generation, state and deadline.  If
   the verdict is timeout, it records the generation into a dedicated
   request abortion field and does RCU wait.

3. The completion path is also protected by RCU (from the previous
   patch) and checks whether the current generation number and state
   match the abortion field.  If so, it skips completion.

4. The timeout path, after RCU wait, scans requests again and
   terminates the ones whose generation and state still match the ones
   requested for abortion.

   By now, the timeout path knows that either the generation number
   and state changed if it lost the race or the completion will yield
   to it and can safely timeout the request.

While it's more lines of code, it's conceptually simpler, doesn't
depend on direct use of subtle memory ordering or coherence, and
hopefully doesn't terminate the wrong instance.

While this change makes REQ_ATOM_COMPLETE synchronization unnecessary
between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't
removed yet as it's still used in other places.  Future patches will
move all state tracking to the new mechanism and remove all bitops in
the hot paths.

Note that this patch adds a comment explaining a race condition in
BLK_EH_RESET_TIMER path.  The race has always been there and this
patch doesn't change it.  It's just documenting the existing race.

v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao.
    - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter.
    - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter.

v3: - Fixed possible extended seqcount / u64_stats_sync read looping
      spotted by Peter.
    - MQ_RQ_IDLE was incorrectly being set in complete_request instead
      of free_request.  Fixed.

v4: - Rebased on top of hctx_lock() refactoring patch.
    - Added comment explaining the use of hctx_lock() in completion path.

v5: - Added comments requested by Bart.
    - Note the addition of BLK_EH_RESET_TIMER race condition in the
      commit message.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |   2 +
 block/blk-mq.c         | 229 +++++++++++++++++++++++++++++++++----------------
 block/blk-mq.h         |  46 ++++++++++
 block/blk-timeout.c    |   2 +-
 block/blk.h            |   6 --
 include/linux/blk-mq.h |   1 +
 include/linux/blkdev.h |  23 +++++
 7 files changed, 230 insertions(+), 79 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 2e0d041e2daf..f843ae4f858d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
 	rq->part = NULL;
+	seqcount_init(&rq->gstate_seq);
+	u64_stats_init(&rq->aborted_gstate_sync);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82b..156203876c8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
+	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	if (rq->tag != -1)
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
+	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
 	if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 }
 
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+	unsigned long flags;
+
+	/*
+	 * blk_mq_rq_aborted_gstate() is used from the completion path and
+	 * can thus be called from irq context.  u64_stats_fetch in the
+	 * middle of update on the same CPU leads to lockup.  Disable irq
+	 * while updating.
+	 */
+	local_irq_save(flags);
+	u64_stats_update_begin(&rq->aborted_gstate_sync);
+	rq->aborted_gstate = gstate;
+	u64_stats_update_end(&rq->aborted_gstate_sync);
+	local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+	unsigned int start;
+	u64 aborted_gstate;
+
+	do {
+		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+		aborted_gstate = rq->aborted_gstate;
+	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+	return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
 
+	/*
+	 * If @rq->aborted_gstate equals the current instance, timeout is
+	 * claiming @rq and we lost.  This is synchronized through
+	 * hctx_lock().  See blk_mq_timeout_work() for details.
+	 *
+	 * Completion path never blocks and we can directly use RCU here
+	 * instead of hctx_lock() which can be either RCU or SRCU.
+	 * However, that would complicate paths which want to synchronize
+	 * against us.  Let stay in sync with the issue path so that
+	 * hctx_lock() covers both issue and completion paths.
+	 */
 	hctx_lock(hctx, &srcu_idx);
-	if (!blk_mark_rq_complete(rq))
+	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+	    !blk_mark_rq_complete(rq))
 		__blk_mq_complete_request(rq);
 	hctx_unlock(hctx, srcu_idx);
 }
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
 		wbt_issue(q->rq_wb, &rq->issue_stat);
 	}
 
-	blk_add_timer(rq);
-
+	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 	WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
 
 	/*
-	 * Mark us as started and clear complete. Complete might have been
-	 * set if requeue raced with timeout, which then marked it as
-	 * complete. So be sure to clear complete again when we start
-	 * the request, otherwise we'll ignore the completion event.
+	 * Mark @rq in-flight which also advances the generation number,
+	 * and register for timeout.  Protect with a seqcount to allow the
+	 * timeout path to read both @rq->gstate and @rq->deadline
+	 * coherently.
 	 *
-	 * Ensure that ->deadline is visible before we set STARTED, such that
-	 * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-	 * it observes STARTED.
+	 * This is the only place where a request is marked in-flight.  If
+	 * the timeout path reads an in-flight @rq->gstate, the
+	 * @rq->deadline it reads together under @rq->gstate_seq is
+	 * guaranteed to be the matching one.
 	 */
-	smp_wmb();
+	preempt_disable();
+	write_seqcount_begin(&rq->gstate_seq);
+
+	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+	blk_add_timer(rq);
+
+	write_seqcount_end(&rq->gstate_seq);
+	preempt_enable();
+
 	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-		/*
-		 * Coherence order guarantees these consecutive stores to a
-		 * single variable propagate in the specified order. Thus the
-		 * clear_bit() is ordered _after_ the set bit. See
-		 * blk_mq_check_expired().
-		 *
-		 * (the bits must be part of the same byte for this to be
-		 * true).
-		 */
+	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-	}
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_sched_requeue_request(rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
 	unsigned long next;
 	unsigned int next_set;
+	unsigned int nr_expired;
 };
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
+		/*
+		 * As nothing prevents from completion happening while
+		 * ->aborted_gstate is set, this may lead to ignored
+		 * completions and further spurious timeouts.
+		 */
+		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
 		blk_clear_rq_complete(req);
 		break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
 	struct blk_mq_timeout_data *data = priv;
-	unsigned long deadline;
+	unsigned long gstate, deadline;
+	int start;
+
+	might_sleep();
 
 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 		return;
 
-	/*
-	 * Ensures that if we see STARTED we must also see our
-	 * up-to-date deadline, see blk_mq_start_request().
-	 */
-	smp_rmb();
-
-	deadline = READ_ONCE(rq->deadline);
+	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
+	while (true) {
+		start = read_seqcount_begin(&rq->gstate_seq);
+		gstate = READ_ONCE(rq->gstate);
+		deadline = rq->deadline;
+		if (!read_seqcount_retry(&rq->gstate_seq, start))
+			break;
+		cond_resched();
+	}
 
-	/*
-	 * The rq being checked may have been freed and reallocated
-	 * out already here, we avoid this race by checking rq->deadline
-	 * and REQ_ATOM_COMPLETE flag together:
-	 *
-	 * - if rq->deadline is observed as new value because of
-	 *   reusing, the rq won't be timed out because of timing.
-	 * - if rq->deadline is observed as previous value,
-	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-	 *   because we put a barrier between setting rq->deadline
-	 *   and clearing the flag in blk_mq_start_request(), so
-	 *   this rq won't be timed out too.
-	 */
-	if (time_after_eq(jiffies, deadline)) {
-		if (!blk_mark_rq_complete(rq)) {
-			/*
-			 * Again coherence order ensures that consecutive reads
-			 * from the same variable must be in that order. This
-			 * ensures that if we see COMPLETE clear, we must then
-			 * see STARTED set and we'll ignore this timeout.
-			 *
-			 * (There's also the MB implied by the test_and_clear())
-			 */
-			blk_mq_rq_timed_out(rq, reserved);
-		}
+	/* if in-flight && overdue, mark for abortion */
+	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+	    time_after_eq(jiffies, deadline)) {
+		blk_mq_rq_update_aborted_gstate(rq, gstate);
+		data->nr_expired++;
+		hctx->nr_expired++;
 	} else if (!data->next_set || time_after(data->next, deadline)) {
 		data->next = deadline;
 		data->next_set = 1;
 	}
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	/*
+	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
+	 * completions that we lost to, they would have finished and
+	 * updated @rq->gstate by now; otherwise, the completion path is
+	 * now guaranteed to see @rq->aborted_gstate and yield.  If
+	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 */
+	if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+	    !blk_mark_rq_complete(rq))
+		blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	struct blk_mq_timeout_data data = {
 		.next		= 0,
 		.next_set	= 0,
+		.nr_expired	= 0,
 	};
+	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
+	/* scan for the expired ones and set their ->aborted_gstate */
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+	if (data.nr_expired) {
+		bool has_rcu = false;
+
+		/*
+		 * Wait till everyone sees ->aborted_gstate.  The
+		 * sequential waits for SRCUs aren't ideal.  If this ever
+		 * becomes a problem, we can add per-hw_ctx rcu_head and
+		 * wait in parallel.
+		 */
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (!hctx->nr_expired)
+				continue;
+
+			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+				has_rcu = true;
+			else
+				synchronize_srcu(hctx->queue_rq_srcu);
+
+			hctx->nr_expired = 0;
+		}
+		if (has_rcu)
+			synchronize_rcu();
+
+		/* terminate the ones we won */
+		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+	}
+
 	if (data.next_set) {
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		struct blk_mq_hw_ctx *hctx;
-
 		queue_for_each_hw_ctx(q, hctx, i) {
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx))
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+			       unsigned int hctx_idx, int node)
+{
+	int ret;
+
+	if (set->ops->init_request) {
+		ret = set->ops->init_request(set, rq, hctx_idx, node);
+		if (ret)
+			return ret;
+	}
+
+	seqcount_init(&rq->gstate_seq);
+	u64_stats_init(&rq->aborted_gstate_sync);
+	return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx, unsigned int depth)
 {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 			struct request *rq = p;
 
 			tags->static_rqs[i] = rq;
-			if (set->ops->init_request) {
-				if (set->ops->init_request(set, rq, hctx_idx,
-						node)) {
-					tags->static_rqs[i] = NULL;
-					goto fail;
-				}
+			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+				tags->static_rqs[i] = NULL;
+				goto fail;
 			}
 
 			p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (!hctx->fq)
 		goto sched_exit_hctx;
 
-	if (set->ops->init_request &&
-	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-				   node))
+	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
 		goto free_fq;
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-	/*
-	 * See comment in block/blk.h rq_atomic_flags enum
-	 */
-	BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-			(REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
 	return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf62..cf01f6f8c73d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,19 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
+/*
+ * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+	MQ_RQ_IDLE		= 0,
+	MQ_RQ_IN_FLIGHT		= 1,
+
+	MQ_RQ_STATE_BITS	= 2,
+	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
+	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
+};
+
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 void blk_mq_release(struct request_queue *q);
 
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state.  The caller is responsible for ensuring that
+ * there are no other updaters.  A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+					  enum mq_rq_state state)
+{
+	u64 old_val = READ_ONCE(rq->gstate);
+	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+	if (state == MQ_RQ_IN_FLIGHT) {
+		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+		new_val += MQ_RQ_GEN_INC;
+	}
+
+	/* avoid exposing interim values */
+	WRITE_ONCE(rq->gstate, new_val);
+}
+
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
 					   unsigned int cpu)
 {
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb30..6427be7ac363 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -208,7 +208,7 @@ void blk_add_timer(struct request *req)
 	if (!req->timeout)
 		req->timeout = q->rq_timeout;
 
-	WRITE_ONCE(req->deadline, jiffies + req->timeout);
+	req->deadline = jiffies + req->timeout;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk.h b/block/blk.h
index 3f1446937aec..9cb2739edb6a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req);
  * Internal atomic flags for request handling
  */
 enum rq_atomic_flags {
-	/*
-	 * Keep these two bits first - not because we depend on the
-	 * value of them, but we do depend on them being in the same
-	 * byte of storage to ensure ordering on writes. Keeping them
-	 * first will achieve that nicely.
-	 */
 	REQ_ATOM_COMPLETE = 0,
 	REQ_ATOM_STARTED,
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c9a5c862e2..460798dbac1f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
 	unsigned int		queue_num;
 
 	atomic_t		nr_active;
+	unsigned int		nr_expired;
 
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46e606f5b44b..ae563d01b29d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
+#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -230,6 +232,27 @@ struct request {
 
 	unsigned short write_hint;
 
+	/*
+	 * On blk-mq, the lower bits of ->gstate (generation number and
+	 * state) carry the MQ_RQ_* state value and the upper bits the
+	 * generation number which is monotonically incremented and used to
+	 * distinguish the reuse instances.
+	 *
+	 * ->gstate_seq allows updates to ->gstate and other fields
+	 * (currently ->deadline) during request start to be read
+	 * atomically from the timeout path, so that it can operate on a
+	 * coherent set of information.
+	 */
+	seqcount_t gstate_seq;
+	u64 gstate;
+
+	/*
+	 * ->aborted_gstate is used by the timeout to claim a specific
+	 * recycle instance of this request.  See blk_mq_timeout_work().
+	 */
+	struct u64_stats_sync aborted_gstate_sync;
+	u64 aborted_gstate;
+
 	unsigned long deadline;
 	struct list_head timeout_list;
 
-- 
cgit v1.2.3


From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 08:29:51 -0800
Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq

After the recent updates to use generation number and state based
synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except
to avoid firing the same timeout multiple times.

Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag
RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple
times.  This removes atomic bitops from hot paths too.

v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out().

v3: Added RQF_MQ_TIMEOUT_EXPIRED flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 15 +++++++--------
 block/blk-timeout.c    |  1 +
 include/linux/blkdev.h |  2 ++
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90f6910a83f6..d1000c6cbec6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -634,8 +634,7 @@ void blk_mq_complete_request(struct request *rq)
 	 * hctx_lock() covers both issue and completion paths.
 	 */
 	hctx_lock(hctx, &srcu_idx);
-	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
-	    !blk_mark_rq_complete(rq))
+	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
 		__blk_mq_complete_request(rq);
 	hctx_unlock(hctx, srcu_idx);
 }
@@ -685,8 +684,6 @@ void blk_mq_start_request(struct request *rq)
 	preempt_enable();
 
 	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
-		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -837,6 +834,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
 		return;
 
+	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
+
 	if (ops->timeout)
 		ret = ops->timeout(req, reserved);
 
@@ -852,7 +851,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		 */
 		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
-		blk_clear_rq_complete(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
 		break;
@@ -871,7 +869,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 
 	might_sleep();
 
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+	if ((rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) ||
+	    !test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 		return;
 
 	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
@@ -906,8 +905,8 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
 	 * now guaranteed to see @rq->aborted_gstate and yield.  If
 	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
 	 */
-	if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
-	    !blk_mark_rq_complete(rq))
+	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
 		blk_mq_rq_timed_out(rq, reserved);
 }
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4f04cd1e0b74..ebe99963386c 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -214,6 +214,7 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	req->deadline = jiffies + req->timeout;
+	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ae563d01b29d..007a7cf1f262 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
+/* timeout is expired */
+#define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-- 
cgit v1.2.3


From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:30:56 -0700
Subject: block: remove REQ_ATOM_POLL_SLEPT

We don't need this to be an atomic flag, it can be a regular
flag. We either end up on the same CPU for the polling, in which
case the state is sane, or we did the sleep which would imply
the needed barrier to ensure we see the right state.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 2 +-
 block/blk-mq.c         | 5 ++---
 block/blk.h            | 2 --
 include/linux/blkdev.h | 2 ++
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 25d41151073d..dd890d5e0fbd 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -290,13 +290,13 @@ static const char *const rqf_name[] = {
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
 	RQF_NAME(MQ_TIMEOUT_EXPIRED),
+	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME
 
 #define RQAF_NAME(name) [REQ_ATOM_##name] = #name
 static const char *const rqaf_name[] = {
 	RQAF_NAME(COMPLETE),
-	RQAF_NAME(POLL_SLEPT),
 };
 #undef RQAF_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 266fc4f6b046..3239ca9e199f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,7 +483,6 @@ void blk_mq_free_request(struct request *rq)
 		blk_put_rl(blk_rq_rl(rq));
 
 	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
-	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -2976,7 +2975,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	unsigned int nsecs;
 	ktime_t kt;
 
-	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+	if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
 		return false;
 
 	/*
@@ -2996,7 +2995,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	if (!nsecs)
 		return false;
 
-	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
 
 	/*
 	 * This will be replaced with the stats tracking code, using
diff --git a/block/blk.h b/block/blk.h
index a68dbe312ea3..eb306c52121e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -124,8 +124,6 @@ void blk_account_io_done(struct request *req);
  */
 enum rq_atomic_flags {
 	REQ_ATOM_COMPLETE = 0,
-
-	REQ_ATOM_POLL_SLEPT,
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 007a7cf1f262..ba31674d8581 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 /* timeout is expired */
 #define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-- 
cgit v1.2.3


From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 9 Jan 2018 14:23:42 -0700
Subject: block: add accessors for setting/querying request deadline

We reduce the resolution of request expiry, but since we're already
using jiffies for this where resolution depends on the kernel
configuration and since the timeout resolution is coarse anyway,
that should be fine.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  2 +-
 block/blk-timeout.c    | 14 ++++++++------
 block/blk.h            | 15 +++++++++++++++
 include/linux/blkdev.h |  4 +++-
 4 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3239ca9e199f..7035c305be45 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -858,7 +858,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 	while (true) {
 		start = read_seqcount_begin(&rq->gstate_seq);
 		gstate = READ_ONCE(rq->gstate);
-		deadline = rq->deadline;
+		deadline = blk_rq_deadline(rq);
 		if (!read_seqcount_retry(&rq->gstate_seq, start))
 			break;
 		cond_resched();
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index ebe99963386c..a05e3676d24a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
 static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set)
 {
-	if (time_after_eq(jiffies, rq->deadline)) {
+	const unsigned long deadline = blk_rq_deadline(rq);
+
+	if (time_after_eq(jiffies, deadline)) {
 		list_del_init(&rq->timeout_list);
 
 		/*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
 		 */
 		if (!blk_mark_rq_complete(rq))
 			blk_rq_timed_out(rq);
-	} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
-		*next_timeout = rq->deadline;
+	} else if (!*next_set || time_after(*next_timeout, deadline)) {
+		*next_timeout = deadline;
 		*next_set = 1;
 	}
 }
@@ -162,7 +164,7 @@ void blk_abort_request(struct request *req)
 		 * immediately and that scan sees the new timeout value.
 		 * No need for fancy synchronizations.
 		 */
-		req->deadline = jiffies;
+		blk_rq_set_deadline(req, jiffies);
 		mod_timer(&req->q->timeout, 0);
 	} else {
 		if (blk_mark_rq_complete(req))
@@ -213,7 +215,7 @@ void blk_add_timer(struct request *req)
 	if (!req->timeout)
 		req->timeout = q->rq_timeout;
 
-	req->deadline = jiffies + req->timeout;
+	blk_rq_set_deadline(req, jiffies + req->timeout);
 	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
@@ -228,7 +230,7 @@ void blk_add_timer(struct request *req)
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
+	expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
 
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index eb306c52121e..bcd9cf7db0d4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -236,6 +236,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 		q->last_merge = NULL;
 }
 
+/*
+ * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+ * setting the deadline clears the bottom bit, potentially clearing the
+ * completed bit. The user has to be OK with this (current ones are fine).
+ */
+static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+{
+	rq->__deadline = time & ~0x1UL;
+}
+
+static inline unsigned long blk_rq_deadline(struct request *rq)
+{
+	return rq->__deadline & ~0x1UL;
+}
+
 /*
  * Internal io_context interface
  */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba31674d8581..aa6698cf483c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -257,7 +257,9 @@ struct request {
 	struct u64_stats_sync aborted_gstate_sync;
 	u64 aborted_gstate;
 
-	unsigned long deadline;
+	/* access through blk_rq_set_deadline, blk_rq_deadline */
+	unsigned long __deadline;
+
 	struct list_head timeout_list;
 
 	/*
-- 
cgit v1.2.3


From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:34:25 -0700
Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit

We only have one atomic flag left. Instead of using an entire
unsigned long for that, steal the bottom bit of the deadline
field that we already reserved.

Remove ->atomic_flags, since it's now unused.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  2 +-
 block/blk-mq-debugfs.c |  9 +--------
 block/blk-mq.c         |  2 +-
 block/blk.h            | 19 +++++++++----------
 include/linux/blkdev.h |  2 --
 5 files changed, 12 insertions(+), 22 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index f843ae4f858d..7ba607527487 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2853,7 +2853,7 @@ void blk_start_request(struct request *req)
 		wbt_issue(req->q->rq_wb, &req->issue_stat);
 	}
 
-	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+	BUG_ON(blk_rq_is_complete(req));
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index dd890d5e0fbd..19db3f583bf1 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -294,12 +294,6 @@ static const char *const rqf_name[] = {
 };
 #undef RQF_NAME
 
-#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
-static const char *const rqaf_name[] = {
-	RQAF_NAME(COMPLETE),
-};
-#undef RQAF_NAME
-
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +310,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
-	seq_puts(m, ", .atomic_flags=");
-	blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
+	seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
 	if (mq_ops->show_rq)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7035c305be45..87e6b10c8ecb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -294,7 +294,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->rq_flags |= RQF_PREEMPT;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
-	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
@@ -313,6 +312,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->special = NULL;
 	/* tag was already set */
 	rq->extra_len = 0;
+	rq->__deadline = 0;
 
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
diff --git a/block/blk.h b/block/blk.h
index bcd9cf7db0d4..c84ae0e21ebd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -119,25 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
 void blk_account_io_done(struct request *req);
 
-/*
- * Internal atomic flags for request handling
- */
-enum rq_atomic_flags {
-	REQ_ATOM_COMPLETE = 0,
-};
-
 /*
  * EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds
+ * sure that only one of them succeeds. Steal the bottom bit of the
+ * __deadline field for this.
  */
 static inline int blk_mark_rq_complete(struct request *rq)
 {
-	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	return test_and_set_bit(0, &rq->__deadline);
 }
 
 static inline void blk_clear_rq_complete(struct request *rq)
 {
-	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	clear_bit(0, &rq->__deadline);
+}
+
+static inline bool blk_rq_is_complete(struct request *rq)
+{
+	return test_bit(0, &rq->__deadline);
 }
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aa6698cf483c..d4b2f7bb18d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -156,8 +156,6 @@ struct request {
 
 	int internal_tag;
 
-	unsigned long atomic_flags;
-
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
 	int tag;
-- 
cgit v1.2.3


From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:46:39 -0700
Subject: block: rearrange a few request fields for better cache layout

Move completion related items (like the call single data) near the
end of the struct, instead of mixing them in with the initial
queueing related fields.

Move queuelist below the bio structures. Then we have all
queueing related bits in the first cache line.

This yields a 1.5-2% increase in IOPS for a null_blk test, both for
sync and for high thread count access. Sync test goes form 975K to
992K, 32-thread case from 20.8M to 21.2M IOPS.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 18 +++++++++---------
 include/linux/blkdev.h | 28 +++++++++++++++-------------
 2 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 87e6b10c8ecb..435a5a0d441f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -270,8 +270,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
 
-	rq->rq_flags = 0;
-
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
 		rq->tag = -1;
 		rq->internal_tag = tag;
@@ -285,26 +283,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
-	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
+	rq->rq_flags = 0;
+	rq->cpu = -1;
 	rq->cmd_flags = op;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
-	rq->cpu = -1;
+	INIT_LIST_HEAD(&rq->queuelist);
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
 	rq->start_time = jiffies;
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-	set_start_time_ns(rq);
-	rq->io_start_time_ns = 0;
-#endif
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -321,6 +315,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
+#ifdef CONFIG_BLK_CGROUP
+	rq->rl = NULL;
+	set_start_time_ns(rq);
+	rq->io_start_time_ns = 0;
+#endif
+
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
 	return rq;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4b2f7bb18d6..71a9371c8182 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t;
  * especially blk_mq_rq_ctx_init() to take care of the added fields.
  */
 struct request {
-	struct list_head queuelist;
-	union {
-		call_single_data_t csd;
-		u64 fifo_time;
-	};
-
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
 
@@ -164,6 +158,8 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
+	struct list_head queuelist;
+
 	/*
 	 * The hash is used inside the scheduler, and killed once the
 	 * request reaches the dispatch list. The ipi_list is only used
@@ -211,19 +207,16 @@ struct request {
 	struct hd_struct *part;
 	unsigned long start_time;
 	struct blk_issue_stat issue_stat;
-#ifdef CONFIG_BLK_CGROUP
-	struct request_list *rl;		/* rl this rq is alloced from */
-	unsigned long long start_time_ns;
-	unsigned long long io_start_time_ns;    /* when passed to hardware */
-#endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	unsigned short nr_integrity_segments;
 #endif
 
+	unsigned short write_hint;
 	unsigned short ioprio;
 
 	unsigned int timeout;
@@ -232,8 +225,6 @@ struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
-	unsigned short write_hint;
-
 	/*
 	 * On blk-mq, the lower bits of ->gstate (generation number and
 	 * state) carry the MQ_RQ_* state value and the upper bits the
@@ -260,6 +251,11 @@ struct request {
 
 	struct list_head timeout_list;
 
+	union {
+		call_single_data_t csd;
+		u64 fifo_time;
+	};
+
 	/*
 	 * completion callback.
 	 */
@@ -268,6 +264,12 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
+
+#ifdef CONFIG_BLK_CGROUP
+	struct request_list *rl;		/* rl this rq is alloced from */
+	unsigned long long start_time_ns;
+	unsigned long long io_start_time_ns;    /* when passed to hardware */
+#endif
 };
 
 static inline bool blk_rq_is_scsi(struct request *rq)
-- 
cgit v1.2.3


From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 19 Jan 2018 08:58:56 -0800
Subject: block: Remove kblockd_schedule_delayed_work{,_on}()

The previous patch removed all users of these two functions. Hence
also remove the functions themselves.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 14 --------------
 include/linux/blkdev.h |  2 --
 2 files changed, 16 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1645a1e54a37..cdae69be68e9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3441,20 +3441,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
-int kblockd_schedule_delayed_work(struct delayed_work *dwork,
-				  unsigned long delay)
-{
-	return queue_delayed_work(kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work);
-
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
-				     unsigned long delay)
-{
-	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
-
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:	The &struct blk_plug that needs to be initialized
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 71a9371c8182..afc43fb63c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
-int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3