From 4f02fb7617ba12ac15d261c654b9759ea8f1f1ef Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Sat, 30 Sep 2017 14:38:49 +0800
Subject: blk-throttle: fix possible io stall when upgrade to max

There is a case that leads to an io stall, described as follows.

  /test1
    |-subtest1
  /test2
    |-subtest2

subtest1 and subtest2 each already have 32 queued bios. Now upgrade to
max. In throtl_upgrade_state, it will try to dispatch bios as follows:

1) tg=subtest1, do nothing;
2) tg=test1, transfer 32 queued bios from subtest1 to test1; no pending
   left, no need to schedule the next dispatch;
3) tg=subtest2, do nothing;
4) tg=test2, transfer 32 queued bios from subtest2 to test2; no pending
   left, no need to schedule the next dispatch;
5) tg=/, transfer 8 queued bios from test1 to /, 8 queued bios from
   test2 to /, 8 queued bios from test1 to /, and 8 queued bios from
   test2 to /; note that test1 and test2 each still have 16 queued bios
   left;
6) tg=/, try to schedule the next dispatch, but since disptime is now
   (updated in tg_update_disptime, wait=0), the pending timer is not
   actually scheduled;
7) in total, throtl_upgrade_state dispatches 32 queued bios and leaves
   32 behind: test1 and test2 each still hold 16 queued bios;
8) throtl_pending_timer_fn sees the leftover bios but can do nothing,
   because throtl_select_dispatch returns 0 and test1/test2 have no
   pending tg.

The blktrace shows the following:

  8,32 0 0 2.539007641 0 m N throtl upgrade to max
  8,32 0 0 2.539072267 0 m N throtl /test2 dispatch nr_queued=16 read=0 write=16
  8,32 7 0 2.539077142 0 m N throtl /test1 dispatch nr_queued=16 read=0 write=16

So force schedule dispatch if there are pending children.

Reviewed-by: Shaohua Li
Signed-off-by: Joseph Qi
Signed-off-by: Jens Axboe
---
 block/blk-throttle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0fea76aa0f3f..17816a028dcb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 		tg->disptime = jiffies - 1;
 		throtl_select_dispatch(sq);
-		throtl_schedule_next_dispatch(sq, false);
+		throtl_schedule_next_dispatch(sq, true);
 	}
 	rcu_read_unlock();
 	throtl_select_dispatch(&td->service_queue);
-	throtl_schedule_next_dispatch(&td->service_queue, false);
+	throtl_schedule_next_dispatch(&td->service_queue, true);
 	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
--
cgit v1.2.3
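For context, the helper whose second argument the patch flips decides whether the pending timer gets armed at all. The following is a simplified paraphrase of throtl_schedule_next_dispatch() as it looks around this kernel version, not the verbatim source; update_min_dispatch_time() and throtl_schedule_pending_timer() are referenced by name only:

	static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
						  bool force)
	{
		/* no pending children below this service queue, nothing to arm */
		if (!sq->nr_pending)
			return false;

		update_min_dispatch_time(sq);

		/*
		 * Without @force, a dispatch time that is not in the future --
		 * the disptime == jiffies case from step 6 above -- never arms
		 * the timer, stranding the leftover bios.
		 */
		if (force || time_after(sq->first_pending_disptime, jiffies)) {
			throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
			return true;
		}

		/* tell the caller to keep dispatching on its own */
		return false;
	}

With force=true, throtl_upgrade_state() is guaranteed a timer for whatever it could not transfer in a single pass.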
From 70e62f4bacdf31ea8a59f241c9229120cd06d9d1 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Tue, 3 Oct 2017 14:57:16 -0700
Subject: blk-mq-debugfs: fix device sched directory for default scheduler

In blk_mq_debugfs_register(), I remembered to set up the per-hctx sched
directories if a default scheduler was already configured by
blk_mq_sched_init() from blk_mq_init_allocated_queue(), but I didn't do
the same for the device-wide sched directory. Fix it.

Fixes: d332ce091813 ("blk-mq-debugfs: allow schedulers to register debugfs attributes")
Signed-off-by: Omar Sandoval
Signed-off-by: Jens Axboe
---
 block/blk-mq-debugfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 980e73095643..de294d775acf 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q)
 		goto err;
 
 	/*
-	 * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir
+	 * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
 	 * didn't exist yet (because we don't know what to name the directory
 	 * until the queue is registered to a gendisk).
 	 */
+	if (q->elevator && !q->sched_debugfs_dir)
+		blk_mq_debugfs_register_sched(q);
+
+	/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx))
 			goto err;
--
cgit v1.2.3
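A condensed view of the patched function makes the symmetry easier to see. This is a sketch with the error-unwind path trimmed and details abbreviated from this kernel version, not the exact function body:

	int blk_mq_debugfs_register(struct request_queue *q)
	{
		struct blk_mq_hw_ctx *hctx;
		int i;

		/* the directory can only be named once the queue is tied to a gendisk */
		q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
						    blk_debugfs_root);
		if (!q->debugfs_dir)
			return -ENOMEM;

		/*
		 * Catch up on the device-wide sched directory that
		 * blk_mq_sched_init() could not create earlier -- the part
		 * this patch adds.
		 */
		if (q->elevator && !q->sched_debugfs_dir)
			blk_mq_debugfs_register_sched(q);

		/* the same catch-up, per hardware context */
		queue_for_each_hw_ctx(q, hctx, i)
			if (!hctx->debugfs_dir)
				blk_mq_debugfs_register_hctx(q, hctx);

		return 0;
	}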
From eab40cf336065e8d765e006b81ff48c5c114b365 Mon Sep 17 00:00:00 2001
From: Benjamin Block
Date: Tue, 3 Oct 2017 12:48:37 +0200
Subject: bsg-lib: fix use-after-free under memory-pressure

Under memory pressure it is possible that the mempool backing the
'struct request_queue' will make use of up to BLKDEV_MIN_RQ emergency
buffers, in case it can't get a regular allocation. These buffers are
preallocated, and once they are used as well, they are re-supplied with
old finished requests from the same request_queue (see mempool_free()).

The bug is that when the emergency pool is re-supplied, the old
requests are not run through the callback mempool_t->alloc() again, and
thus also not through the callback bsg_init_rq(). So we skip
initialization, and while the sense-buffer should still be good,
scsi_request->cmd may have become an invalid pointer in the meantime.
When the request is initialized in bsg.c and the user's CDB is larger
than BLK_MAX_CDB, bsg replaces it with a custom-allocated buffer, which
is freed when the user's command finishes, so the pointer dangles
afterwards. The next time the user sends a command whose CDB is no
larger than BLK_MAX_CDB, bsg assumes that scsi_request->cmd is backed
by scsi_request->__cmd, makes no custom allocation, and writes into
undefined memory.

Fix this by splitting bsg_init_rq() into two functions:

- bsg_init_rq() is changed to only allocate the sense-buffer, which is
  used to back the bsg job's reply buffer. This pointer should never
  change during the lifetime of a scsi_request, so it doesn't need
  re-initialization.
- bsg_initialize_rq() is a new function that makes use of
  'struct request_queue's initialize_rq_fn callback (introduced in
  v4.12). This is always called before the request is given out via
  blk_get_request(). It does the remaining initialization that was
  previously done in bsg_init_rq(), and also does it when the request
  is taken from the emergency pool of the backing mempool.

Fixes: 50b4d485528d ("bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer")
Cc: stable@vger.kernel.org # 4.11+
Reviewed-by: Hannes Reinecke
Reviewed-by: Johannes Thumshirn
Reviewed-by: Christoph Hellwig
Signed-off-by: Benjamin Block
Signed-off-by: Jens Axboe
---
 block/bsg-lib.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index dbddff8174e5..15d25ccd51a5 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct scsi_request *sreq = &job->sreq;
 
+	/* called right after the request is allocated for the request_queue */
+
+	sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+	if (!sreq->sense)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void bsg_initialize_rq(struct request *req)
+{
+	struct bsg_job *job = blk_mq_rq_to_pdu(req);
+	struct scsi_request *sreq = &job->sreq;
+	void *sense = sreq->sense;
+
+	/* called right before the request is given to the request_queue user */
+
 	memset(job, 0, sizeof(*job));
 	scsi_req_init(sreq);
+
+	sreq->sense = sense;
 	sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
-	sreq->sense = kzalloc(sreq->sense_len, gfp);
-	if (!sreq->sense)
-		return -ENOMEM;
 
 	job->req = req;
-	job->reply = sreq->sense;
+	job->reply = sense;
 	job->reply_len = sreq->sense_len;
 	job->dd_data = job + 1;
-
-	return 0;
 }
 
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
 {
@@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
 	q->init_rq_fn = bsg_init_rq;
 	q->exit_rq_fn = bsg_exit_rq;
+	q->initialize_rq_fn = bsg_initialize_rq;
 	q->request_fn = bsg_request_fn;
 
 	ret = blk_init_allocated_queue(q);
--
cgit v1.2.3

From 95d78c28b5a85bacbc29b8dba7c04babb9b0d467 Mon Sep 17 00:00:00 2001
From: Vitaly Mayatskikh
Date: Fri, 22 Sep 2017 01:18:39 -0400
Subject: fix unbalanced page refcounting in bio_map_user_iov

bio_map_user_iov and bio_unmap_user do unbalanced page refcounting if
the IO vector has small consecutive buffers belonging to the same page.
bio_add_pc_page merges them into one, but the extra page reference is
never dropped.

Cc: stable@vger.kernel.org
Signed-off-by: Vitaly Mayatskikh
Signed-off-by: Al Viro
---
 block/bio.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index b38e962fa83e..0d6439e89acb 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1383,6 +1383,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 		offset = offset_in_page(uaddr);
 		for (j = cur_page; j < page_limit; j++) {
 			unsigned int bytes = PAGE_SIZE - offset;
+			unsigned short prev_bi_vcnt = bio->bi_vcnt;
 
 			if (len <= 0)
 				break;
@@ -1397,6 +1398,13 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 					    bytes)
 				break;
 
+			/*
+			 * check if vector was merged with previous
+			 * drop page reference if needed
+			 */
+			if (bio->bi_vcnt == prev_bi_vcnt)
+				put_page(pages[j]);
+
 			len -= bytes;
 			offset = 0;
 		}
--
cgit v1.2.3
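The completion path shows the invariant the mapping side has to respect: exactly one page reference per bio segment. This is a condensed copy of __bio_unmap_user() as it looks in this era (details may differ slightly from the exact tree):

	static void __bio_unmap_user(struct bio *bio)
	{
		struct bio_vec *bvec;
		int i;

		/*
		 * One put_page() per segment -- so a segment that was merged
		 * from two iovecs must not arrive here holding two references.
		 */
		bio_for_each_segment_all(bvec, bio, i) {
			if (bio_data_dir(bio) == READ)
				set_page_dirty_lock(bvec->bv_page);

			put_page(bvec->bv_page);
		}

		bio_put(bio);
	}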
From 2b04e8f6bbb196cab4b232af0f8d48ff2c7a8058 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 23 Sep 2017 15:51:23 -0400
Subject: more bio_map_user_iov() leak fixes

We need to take care of the failure exit as well: pages already in the
bio should be dropped by an analogue of bio_unmap_pages(), since their
refcounts have been bumped only once per reference in the bio.

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro
---
 block/bio.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 0d6439e89acb..9e9606d26cc6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1331,6 +1331,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	int ret, offset;
 	struct iov_iter i;
 	struct iovec iov;
+	struct bio_vec *bvec;
 
 	iov_for_each(iov, i, *iter) {
 		unsigned long uaddr = (unsigned long) iov.iov_base;
@@ -1375,7 +1376,12 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 		ret = get_user_pages_fast(uaddr, local_nr_pages,
 				(iter->type & WRITE) != WRITE,
 				&pages[cur_page]);
-		if (ret < local_nr_pages) {
+		if (unlikely(ret < local_nr_pages)) {
+			for (j = cur_page; j < page_limit; j++) {
+				if (!pages[j])
+					break;
+				put_page(pages[j]);
+			}
 			ret = -EFAULT;
 			goto out_unmap;
 		}
@@ -1431,10 +1437,8 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	return bio;
 
  out_unmap:
-	for (j = 0; j < nr_pages; j++) {
-		if (!pages[j])
-			break;
-		put_page(pages[j]);
+	bio_for_each_segment_all(bvec, bio, j) {
+		put_page(bvec->bv_page);
 	}
  out:
 	kfree(pages);
--
cgit v1.2.3

From 1cfd0ddd82232804e03f3023f6a58b50dfef0574 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sun, 24 Sep 2017 10:21:15 -0400
Subject: bio_copy_user_iov(): don't ignore ->iov_offset

Since "block: support large requests in blk_rq_map_user_iov" we have
started to call it with a partially drained iter; that works fine on
the write side, but reads create a copy of the iter for completion
time. And that needs to take the possibility of ->iov_offset != 0 into
account...

Cc: stable@vger.kernel.org # v4.5+
Signed-off-by: Al Viro
---
 block/bio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 9e9606d26cc6..101c2a9b5481 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1239,8 +1239,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	 */
 	bmd->is_our_pages = map_data ? 0 : 1;
 	memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs);
-	iov_iter_init(&bmd->iter, iter->type, bmd->iov,
-			iter->nr_segs, iter->count);
+	bmd->iter = *iter;
+	bmd->iter.iov = bmd->iov;
 
 	ret = -ENOMEM;
 	bio = bio_kmalloc(gfp_mask, nr_pages);
--
cgit v1.2.3
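To see what the structure assignment preserves, it helps to recall the shape of struct iov_iter in this era; the sketch below abbreviates the real definition (the unions over segment types are reduced to the iovec case):

	struct iov_iter {
		int type;
		size_t iov_offset;		/* bytes already consumed within iov[0] */
		size_t count;			/* bytes remaining */
		const struct iovec *iov;	/* really a union with kvec/bvec/pipe */
		unsigned long nr_segs;
	};

	/*
	 * iov_iter_init() rebuilds the iterator with iov_offset == 0, so the
	 * copy kept for READ completion would copy back to user space
	 * starting at the wrong place once the caller had partially drained
	 * the iter.  Plain structure assignment keeps iov_offset and count
	 * intact; only the iovec pointer is redirected to the longer-lived
	 * copy inside bmd:
	 *
	 *	bmd->iter = *iter;
	 *	bmd->iter.iov = bmd->iov;
	 */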