Diffstat (limited to 'block')
53 files changed, 9521 insertions, 2185 deletions
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..a2a92e57a87d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,8 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP + select SRCU help Provide block layer support for the kernel. @@ -47,9 +49,13 @@ config LBDAF If unsure, say Y. +config BLK_SCSI_REQUEST + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y + select BLK_SCSI_REQUEST help Saying Y here will enable generic SG (SCSI generic) v4 support for any block device. @@ -69,6 +75,7 @@ config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" default n select BLK_DEV_BSG + select BLK_SCSI_REQUEST help Subsystems will normally enable this if needed. Users will not normally need to manually enable this. @@ -88,6 +95,14 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_ZONED + bool "Zoned block device support" + ---help--- + Block layer zoned block device support. This option enables + support for ZAC/ZBC host-managed and host-aware zoned block devices. + + Say yes here if you have a ZAC or ZBC storage device. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y @@ -111,6 +126,51 @@ config BLK_CMDLINE_PARSER See Documentation/block/cmdline-partition.txt for more information. +config BLK_WBT + bool "Enable support for block device writeback throttling" + default n + ---help--- + Enabling this option enables the block layer to throttle buffered + background writeback from the VM, making it more smooth and having + less impact on foreground operations. The throttling is done + dynamically on an algorithm loosely based on CoDel, factoring in + the realtime performance of the disk. + +config BLK_WBT_SQ + bool "Single queue writeback throttling" + default n + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on legacy single queue devices + +config BLK_WBT_MQ + bool "Multiqueue writeback throttling" + default y + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on multiqueue devices. + Multiqueue currently doesn't have support for IO scheduling, + enabling this option is recommended. + +config BLK_DEBUG_FS + bool "Block layer debugging information in debugfs" + default y + depends on DEBUG_FS + ---help--- + Include block layer debugging information in debugfs. This information + is mostly useful for kernel developers, but it doesn't incur any cost + at runtime. + + Unless you are building a kernel for a tiny system, you should + say Y here. + +config BLK_SED_OPAL + bool "Logic for interfacing with Opal enabled SEDs" + ---help--- + Builds Logic for interfacing with Opal enabled controllers. + Enabling this option enables users to setup/unlock/lock + Locking ranges for SED devices using the Opal protocol. + menu "Partition Types" source "block/partitions/Kconfig" @@ -124,4 +184,9 @@ config BLOCK_COMPAT depends on BLOCK && COMPAT default y +config BLK_MQ_PCI + bool + depends on BLOCK && PCI + default y + source block/Kconfig.iosched diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9c4c48..0715ce93daef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -63,6 +63,56 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + ---help--- + MQ version of the deadline IO scheduler. 
+ +config MQ_IOSCHED_NONE + bool + default y + +choice + prompt "Default single-queue blk-mq I/O scheduler" + default DEFAULT_SQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with a single queue. + + config DEFAULT_SQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_SQ_NONE + bool "None" + +endchoice + +config DEFAULT_SQ_IOSCHED + string + default "mq-deadline" if DEFAULT_SQ_DEADLINE + default "none" if DEFAULT_SQ_NONE + +choice + prompt "Default multi-queue blk-mq I/O scheduler" + default DEFAULT_MQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with multiple queues. + + config DEFAULT_MQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_MQ_NONE + bool "None" + +endchoice + +config DEFAULT_MQ_IOSCHED + string + default "mq-deadline" if DEFAULT_MQ_DEADLINE + default "none" if DEFAULT_MQ_NONE + endmenu endif diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..2ad7c304e3f5 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,12 +5,13 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ - blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ + genhd.o partition-generic.o ioprio.o \ badblocks.o partitions/ -obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -18,8 +19,13 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o - +obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o +obj-$(CONFIG_BLK_WBT) += blk-wbt.o +obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/badblocks.c b/block/badblocks.c index 7be53cb1cc3c..6ebcef282314 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -133,6 +133,26 @@ retry: } EXPORT_SYMBOL_GPL(badblocks_check); +static void badblocks_update_acked(struct badblocks *bb) +{ + u64 *p = bb->page; + int i; + bool unacked = false; + + if (!bb->unacked_exist) + return; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + unacked = true; + break; + } + } + + if (!unacked) + bb->unacked_exist = 0; +} + /** * badblocks_set() - Add a range of bad blocks to the table. * @bb: the badblocks structure that holds all badblock information @@ -294,6 +314,8 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, bb->changed = 1; if (!acknowledged) bb->unacked_exist = 1; + else + badblocks_update_acked(bb); write_sequnlock_irqrestore(&bb->lock, flags); return rv; @@ -354,7 +376,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * current range. 
Earlier ranges could also overlap, * but only this one can overlap the end of the range. */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && + (BB_OFFSET(p[lo]) < target)) { /* Partial overlap, leave the tail of this range */ int ack = BB_ACK(p[lo]); sector_t a = BB_OFFSET(p[lo]); @@ -377,7 +400,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) lo--; } while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && + (BB_OFFSET(p[lo]) < target)) { /* This range does overlap */ if (BB_OFFSET(p[lo]) < s) { /* Keep the early parts of this range. */ @@ -399,6 +423,7 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) } } + badblocks_update_acked(bb); bb->changed = 1; out: write_sequnlock_irq(&bb->lock); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 63f72f00c72e..5384713d48bc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,7 +172,7 @@ bool bio_integrity_enabled(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - if (!bio_is_rw(bio)) + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return false; /* Already protected? */ diff --git a/block/bio.c b/block/bio.c index aa7354088008..4b564d0c3e29 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,11 +270,15 @@ static void bio_free(struct bio *bio) } } -void bio_init(struct bio *bio) +void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs) { memset(bio, 0, sizeof(*bio)); atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + + bio->bi_io_vec = table; + bio->bi_max_vecs = max_vecs; } EXPORT_SYMBOL(bio_init); @@ -480,7 +484,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) return NULL; bio = p + front_pad; - bio_init(bio); + bio_init(bio, NULL, 0); if (nr_iovecs > inline_vecs) { unsigned long idx = 0; @@ -670,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; @@ -847,6 +852,55 @@ done: } EXPORT_SYMBOL(bio_add_page); +/** + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins as many pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + size_t offset, diff; + ssize_t size; + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; + + /* + * Deep magic below: We need to walk the pinned pages backwards + * because we are abusing the space allocated for the bio_vecs + * for the page array. Because the bio_vecs are larger than the + * page pointers by definition this will always work. But it also + * means we can't use bio_add_page, so any changes to it's semantics + * need to be reflected here as well. 
+ */ + bio->bi_iter.bi_size += size; + bio->bi_vcnt += nr_pages; + + diff = (nr_pages * PAGE_SIZE - offset) - size; + while (nr_pages--) { + bv[nr_pages].bv_page = pages[nr_pages]; + bv[nr_pages].bv_len = PAGE_SIZE; + bv[nr_pages].bv_offset = 0; + } + + bv[0].bv_offset += offset; + bv[0].bv_len -= offset; + if (diff) + bv[bio->bi_vcnt - 1].bv_len -= diff; + + iov_iter_advance(iter, size); + return 0; +} +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + struct submit_bio_ret { struct completion event; int error; @@ -1068,7 +1122,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1130,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio @@ -1172,9 +1227,6 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (!bio) goto out_bmd; - if (iter->type & WRITE) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = 0; if (map_data) { @@ -1274,7 +1326,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); @@ -1339,16 +1391,10 @@ struct bio *bio_map_user_iov(struct request_queue *q, kfree(pages); - /* - * set data direction, and check if mapped pages need bouncing - */ - if (iter->type & WRITE) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio_set_flag(bio, BIO_USER_MAPPED); /* - * subtle -- if __bio_map_user() ended up bouncing a bio, + * subtle -- if bio_map_user_iov() ended up bouncing a bio, * it would normally disappear when its bi_end_io is run. * however, we need it for the unmap, so grab an extra * reference to it @@ -1390,8 +1436,8 @@ static void __bio_unmap_user(struct bio *bio) * bio_unmap_user - unmap a bio * @bio: the bio being unmapped * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. * * bio_unmap_user() may sleep. */ @@ -1535,7 +1581,6 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, bio->bi_private = data; } else { bio->bi_end_io = bio_copy_kern_endio; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } return bio; @@ -1785,15 +1830,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands. 
- */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dd38e5ced4a3..295e98c2c8cc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -184,8 +184,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_NOWAIT); + wb_congested = wb_congested_get_create(q->backing_dev_info, + blkcg->css.id, + GFP_NOWAIT | __GFP_NOWARN); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -468,8 +469,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info.dev) - return dev_name(blkg->q->backing_dev_info.dev); + if (blkg->q->backing_dev_info->dev) + return dev_name(blkg->q->backing_dev_info->dev); return NULL; } EXPORT_SYMBOL_GPL(blkg_dev_name); @@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1078,10 +1079,8 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - if (IS_ERR(blkg)) { - blkg_free(new_blkg); + if (IS_ERR(blkg)) return PTR_ERR(blkg); - } q->root_blkg = blkg; q->root_rl.blkg = blkg; @@ -1222,7 +1221,10 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1240,7 +1242,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1260,7 +1262,10 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1283,7 +1288,11 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1303,7 +1312,11 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(q->queue_lock); - blk_queue_bypass_end(q); + + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); @@ -1340,10 +1353,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - mutex_unlock(&blkcg_pol_mutex); + 
if (!cpd) goto err_free_cpds; - } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c..b9e857f4afe8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,12 +33,19 @@ #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> +#include <linux/debugfs.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-wbt.h" + +#ifdef CONFIG_DEBUG_FS +struct dentry *blk_debugfs_root; +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -73,7 +80,7 @@ static void blk_clear_congested(struct request_list *rl, int sync) * flip its congestion state for events on other blkcgs. */ if (rl == &rl->q->root_rl) - clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -84,7 +91,7 @@ static void blk_set_congested(struct request_list *rl, int sync) #else /* see blk_clear_congested() */ if (rl == &rl->q->root_rl) - set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -103,22 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; } -/** - * blk_get_backing_dev_info - get the address of a queue's backing_dev_info - * @bdev: device - * - * Locates the passed device's request queue and returns the address of its - * backing_dev_info. This function can only be called if @bdev is opened - * and the return value is never NULL. - */ -struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - return &q->backing_dev_info; -} -EXPORT_SYMBOL(blk_get_backing_dev_info); - void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); @@ -130,9 +121,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->cmd = rq->__cmd; - rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; + rq->internal_tag = -1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -145,22 +135,20 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (error) bio->bi_error = error; - if (unlikely(rq->cmd_flags & REQ_QUIET)) + if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { - int bit; - - printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->rq_disk ? 
rq->rq_disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", @@ -168,13 +156,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - printk(KERN_INFO " cdb: "); - for (bit = 0; bit < BLK_MAX_CDB; bit++) - printk("%02x ", rq->cmd[bit]); - printk("\n"); - } } EXPORT_SYMBOL(blk_dump_rq_flags); @@ -288,7 +269,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { @@ -524,12 +505,14 @@ void blk_set_queue_dying(struct request_queue *q) else { struct request_list *rl; + spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up(&rl->wait[BLK_RW_SYNC]); wake_up(&rl->wait[BLK_RW_ASYNC]); } } + spin_unlock_irq(q->queue_lock); } } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -583,7 +566,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_flush_integrity(); /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) @@ -595,7 +578,8 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(&q->backing_dev_info); + bdi_unregister(q->backing_dev_info); + put_disk_devt(q->disk_devt); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -603,17 +587,41 @@ void blk_cleanup_queue(struct request_queue *q) EXPORT_SYMBOL(blk_cleanup_queue); /* Allocate memory local to the request queue */ -static void *alloc_request_struct(gfp_t gfp_mask, void *data) +static void *alloc_request_simple(gfp_t gfp_mask, void *data) { - int nid = (int)(long)data; - return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); + struct request_queue *q = data; + + return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); } -static void free_request_struct(void *element, void *unused) +static void free_request_simple(void *element, void *data) { kmem_cache_free(request_cachep, element); } +static void *alloc_request_size(gfp_t gfp_mask, void *data) +{ + struct request_queue *q = data; + struct request *rq; + + rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, + q->node); + if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { + kfree(rq); + rq = NULL; + } + return rq; +} + +static void free_request_size(void *element, void *data) +{ + struct request_queue *q = data; + + if (q->exit_rq_fn) + q->exit_rq_fn(q, element); + kfree(element); +} + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { @@ -626,10 +634,15 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct, - free_request_struct, - (void *)(long)q->node, gfp_mask, - q->node); + if (q->cmd_size) { + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + alloc_request_size, free_request_size, + q, gfp_mask, q->node); + } else { + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + alloc_request_simple, free_request_simple, + q, gfp_mask, q->node); + } if 
(!rl->rq_pool) return -ENOMEM; @@ -692,7 +705,6 @@ static void blk_rq_timed_out_timer(unsigned long data) struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; - int err; q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); @@ -707,17 +719,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->bio_split) goto fail_id; - q->backing_dev_info.ra_pages = + q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + if (!q->backing_dev_info) + goto fail_split; + + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; - q->backing_dev_info.name = "block"; + q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; + q->backing_dev_info->name = "block"; q->node = node_id; - err = bdi_init(&q->backing_dev_info); - if (err) - goto fail_split; - - setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->queue_head); @@ -767,7 +779,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: - bdi_destroy(&q->backing_dev_info); + bdi_put(q->backing_dev_info); fail_split: bioset_free(q->bio_split); fail_id: @@ -820,15 +832,19 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *uninit_q, *q; + struct request_queue *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); - if (!uninit_q) + q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!q) return NULL; - q = blk_init_allocated_queue(uninit_q, rfn, lock); - if (!q) - blk_cleanup_queue(uninit_q); + q->request_fn = rfn; + if (lock) + q->queue_lock = lock; + if (blk_init_allocated_queue(q) < 0) { + blk_cleanup_queue(q); + return NULL; + } return q; } @@ -836,30 +852,22 @@ EXPORT_SYMBOL(blk_init_queue_node); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); -struct request_queue * -blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, - spinlock_t *lock) -{ - if (!q) - return NULL; - q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); +int blk_init_allocated_queue(struct request_queue *q) +{ + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size); if (!q->fq) - return NULL; + return -ENOMEM; + + if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) + goto out_free_flush_queue; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) - goto fail; + goto out_exit_flush_rq; INIT_WORK(&q->timeout_work, blk_timeout_work); - q->request_fn = rfn; - q->prep_rq_fn = NULL; - q->unprep_rq_fn = NULL; q->queue_flags |= QUEUE_FLAG_DEFAULT; - /* Override internal queue lock with supplied lock pointer */ - if (lock) - q->queue_lock = lock; - /* * This also sets hw/phys segments, boundary and size */ @@ -873,16 +881,19 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, /* init elevator */ if (elevator_init(q, NULL)) { mutex_unlock(&q->sysfs_lock); - goto fail; + goto out_exit_flush_rq; } mutex_unlock(&q->sysfs_lock); + return 0; - return q; - -fail: +out_exit_flush_rq: + if (q->exit_rq_fn) + q->exit_rq_fn(q, q->fq->flush_rq); +out_free_flush_queue: blk_free_flush_queue(q->fq); - return NULL; + wbt_exit(q); + return -ENOMEM; } 
EXPORT_SYMBOL(blk_init_allocated_queue); @@ -899,7 +910,7 @@ EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) { - if (rq->cmd_flags & REQ_ELVPRIV) { + if (rq->rq_flags & RQF_ELVPRIV) { elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); @@ -961,14 +972,14 @@ static void __freed_request(struct request_list *rl, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_list *rl, int op, unsigned int flags) +static void freed_request(struct request_list *rl, bool sync, + req_flags_t rq_flags) { struct request_queue *q = rl->q; - int sync = rw_is_sync(op, flags); q->nr_rqs[sync]--; rl->count[sync]--; - if (flags & REQ_ELVPRIV) + if (rq_flags & RQF_ELVPRIV) q->nr_rqs_elvpriv--; __freed_request(rl, sync); @@ -1018,46 +1029,10 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) return 0; } -/* - * Determine if elevator data should be initialized when allocating the - * request associated with @bio. - */ -static bool blk_rq_should_init_elevator(struct bio *bio) -{ - if (!bio) - return true; - - /* - * Flush requests do not use the elevator so skip initialization. - * This allows a request to share the flush and elevator data. - */ - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) - return false; - - return true; -} - -/** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,22 +1043,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; + req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1127,23 +1102,26 @@ static struct request *__get_request(struct request_list *rl, int op, /* * Decide whether the new request will be managed by elevator. If - * so, mark @op_flags and increment elvpriv. Non-zero elvpriv will + * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. 
This guarantees icq's won't be destroyed and * makes creating new ones safe. * + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + * * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. */ - if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { - op_flags |= REQ_ELVPRIV; + if (!op_is_flush(op) && !blk_queue_bypass(q)) { + rq_flags |= RQF_ELVPRIV; q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; + rq_flags |= RQF_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ @@ -1153,10 +1131,12 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED); + blk_rq_set_prio(rq, ioc); + rq->cmd_flags = op; + rq->rq_flags = rq_flags; /* init elvpriv */ - if (op_flags & REQ_ELVPRIV) { + if (rq_flags & RQF_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); @@ -1193,9 +1173,9 @@ fail_elvpriv: * disturb iosched and blkcg but weird is bettern than dead. */ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", - __func__, dev_name(q->backing_dev_info.dev)); + __func__, dev_name(q->backing_dev_info->dev)); - rq->cmd_flags &= ~REQ_ELVPRIV; + rq->rq_flags &= ~RQF_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); @@ -1212,7 +1192,7 @@ fail_alloc: * queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(rl, op, op_flags); + freed_request(rl, is_sync, rq_flags); /* * in the very unlikely event that allocation failed and no @@ -1230,8 +1210,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1242,18 +1221,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. 
*/ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1289,13 +1267,11 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, { struct request *rq; - BUG_ON(rw != READ && rw != WRITE); - /* create ioc upfront */ create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1320,18 +1296,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) EXPORT_SYMBOL(blk_get_request); /** - * blk_rq_set_block_pc - initialize a request to type BLOCK_PC - * @rq: request to be initialized - * - */ -void blk_rq_set_block_pc(struct request *rq) -{ - rq->cmd_type = REQ_TYPE_BLOCK_PC; - memset(rq->__cmd, 0, sizeof(rq->__cmd)); -} -EXPORT_SYMBOL(blk_rq_set_block_pc); - -/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -1346,8 +1310,9 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); - if (rq->cmd_flags & REQ_QUEUED) + if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1409,7 +1374,7 @@ EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) + if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else @@ -1421,6 +1386,8 @@ static inline void blk_pm_put_request(struct request *rq) {} */ void __blk_put_request(struct request_queue *q, struct request *req) { + req_flags_t rq_flags = req->rq_flags; + if (unlikely(!q)) return; @@ -1436,20 +1403,21 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->issue_stat); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ - if (req->cmd_flags & REQ_ALLOCED) { - unsigned int flags = req->cmd_flags; - int op = req_op(req); + if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); blk_free_request(rl, req); - freed_request(rl, op, flags); + freed_request(rl, sync, rq_flags); blk_put_rl(rl); } } @@ -1471,38 +1439,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -/** - * blk_add_request_payload - add a payload to a request - * @rq: request to update - * @page: page backing the payload - * @offset: offset in page - * @len: length of the payload. - * - * This allows to later add a payload to an already submitted request by - * a block driver. 
The driver needs to take care of freeing the payload - * itself. - * - * Note that this is a quite horrible hack and nothing but handling of - * discard requests should ever use it. - */ -void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len) -{ - struct bio *bio = rq->bio; - - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = offset; - bio->bi_io_vec->bv_len = len; - - bio->bi_iter.bi_size = len; - bio->bi_vcnt = 1; - bio->bi_phys_segments = 1; - - rq->__data_len = rq->resid_len = len; - rq->nr_phys_segments = 1; -} -EXPORT_SYMBOL_GPL(blk_add_request_payload); - bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { @@ -1549,6 +1485,30 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, return true; } +bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + req->nr_phys_segments = segments + 1; + + blk_account_io_start(req, false); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + /** * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at @@ -1577,12 +1537,11 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, { struct blk_plug *plug; struct request *rq; - bool ret = false; struct list_head *plug_list; plug = current->plug; if (!plug) - goto out; + return false; *request_count = 0; if (q->mq_ops) @@ -1591,7 +1550,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, plug_list = &plug->list; list_for_each_entry_reverse(rq, plug_list, queuelist) { - int el_ret; + bool merged = false; if (rq->q == q) { (*request_count)++; @@ -1607,19 +1566,25 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; - el_ret = blk_try_merge(rq, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - ret = bio_attempt_back_merge(q, rq, bio); - if (ret) - break; - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - ret = bio_attempt_front_merge(q, rq, bio); - if (ret) - break; + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + merged = bio_attempt_back_merge(q, rq, bio); + break; + case ELEVATOR_FRONT_MERGE: + merged = bio_attempt_front_merge(q, rq, bio); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + break; } + + if (merged) + return true; } -out: - return ret; + + return false; } unsigned int blk_plug_queued_count(struct request_queue *q) @@ -1648,25 +1613,23 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; req->__sector = bio->bi_iter.bi_sector; - req->ioprio = bio_prio(bio); + if (ioprio_valid(bio_prio(bio))) + req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct 
blk_plug *plug; - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; - struct request *req; + int where = ELEVATOR_INSERT_SORT; + struct request *req, *free; unsigned int request_count = 0; + unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1683,7 +1646,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) { + if (op_is_flush(bio->bi_opf)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -1701,48 +1664,48 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - el_ret = elv_merge(q, &req, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - if (bio_attempt_back_merge(q, req, bio)) { - elv_bio_merged(q, req, bio); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out_unlock; - } - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - if (bio_attempt_front_merge(q, req, bio)) { - elv_bio_merged(q, req, bio); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out_unlock; - } + switch (elv_merge(q, &req, bio)) { + case ELEVATOR_BACK_MERGE: + if (!bio_attempt_back_merge(q, req, bio)) + break; + elv_bio_merged(q, req, bio); + free = attempt_back_merge(q, req); + if (free) + __blk_put_request(q, free); + else + elv_merged_request(q, req, ELEVATOR_BACK_MERGE); + goto out_unlock; + case ELEVATOR_FRONT_MERGE: + if (!bio_attempt_front_merge(q, req, bio)) + break; + elv_bio_merged(q, req, bio); + free = attempt_front_merge(q, req); + if (free) + __blk_put_request(q, free); + else + elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); + goto out_unlock; + default: + break; } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); + wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + __wbt_done(q->rq_wb, wb_acct); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + wbt_track(&req->issue_stat, wb_acct); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -1759,11 +1722,16 @@ get_rq: /* * If this is the first request added after a plug, fire * of a plug trace. + * + * @request_count may become stale because of schedule + * out, so check plug list again. */ - if (!request_count) + if (!request_count || list_empty(&plug->list)) trace_block_plug(q); else { - if (request_count >= BLK_MAX_REQUEST_COUNT) { + struct request *last = list_entry_rq(plug->list.prev); + if (request_count >= BLK_MAX_REQUEST_COUNT || + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { blk_flush_plug_list(plug, false); trace_block_plug(q); } @@ -1788,7 +1756,12 @@ static inline void blk_partition_remap(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - if (bio_sectors(bio) && bdev != bdev->bd_contains) { + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. 
+ * Include a test for the reset op code and perform the remap if needed. + */ + if (bdev != bdev->bd_contains && + (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { struct hd_struct *p = bdev->bd_part; bio->bi_iter.bi_sector += p->start_sect; @@ -1920,7 +1893,7 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { @@ -1942,6 +1915,15 @@ generic_make_request_checks(struct bio *bio) if (!bdev_write_same(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_ZONE_REPORT: + case REQ_OP_ZONE_RESET: + if (!bdev_is_zoned(bio->bi_bdev)) + goto not_supported; + break; + case REQ_OP_WRITE_ZEROES: + if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + goto not_supported; + break; default: break; } @@ -2160,7 +2142,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false, false); return 0; } @@ -2176,7 +2158,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); @@ -2210,7 +2192,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) unsigned int bytes = 0; struct bio *bio; - if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* @@ -2253,7 +2235,7 @@ void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { + if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2281,7 +2263,7 @@ static struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) { if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) + (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) return NULL; else return rq; @@ -2357,13 +2339,13 @@ struct request *blk_peek_request(struct request_queue *q) if (!rq) break; - if (!(rq->cmd_flags & REQ_STARTED)) { + if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_activate_rq(q, rq); /* @@ -2371,7 +2353,7 @@ struct request *blk_peek_request(struct request_queue *q) * it, a request that has been delayed should * not be passed by new incoming requests */ - rq->cmd_flags |= REQ_STARTED; + rq->rq_flags |= RQF_STARTED; trace_block_rq_issue(q, rq); } @@ -2380,7 +2362,7 @@ struct request *blk_peek_request(struct request_queue *q) q->boundary_rq = NULL; } - if (rq->cmd_flags & REQ_DONTPREP) + if (rq->rq_flags & RQF_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { @@ -2403,11 +2385,11 @@ struct request *blk_peek_request(struct request_queue *q) /* * the request may have been (partially) prepped. 
* we need to keep this request in the front to - * avoid resource deadlock. REQ_STARTED will + * avoid resource deadlock. RQF_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->cmd_flags & REQ_DONTPREP)) { + !(rq->rq_flags & RQF_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again @@ -2420,7 +2402,7 @@ struct request *blk_peek_request(struct request_queue *q) } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. @@ -2475,13 +2457,11 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); - /* - * We are now handing the request to the hardware, initialize - * resid_len to full count and add the timeout handler. - */ - req->resid_len = blk_rq_bytes(req); - if (unlikely(blk_bidi_rq(req))) - req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { + blk_stat_set_issue_time(&req->issue_stat); + req->rq_flags |= RQF_STATS; + wbt_issue(req->q->rq_wb, &req->issue_stat); + } BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); @@ -2553,11 +2533,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. */ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->errors = 0; - if (error && req->cmd_type == REQ_TYPE_FS && - !(req->cmd_flags & REQ_QUIET)) { + if (error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET)) { char *error_type; switch (error) { @@ -2623,14 +2603,16 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } + WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD); + req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ - if (req->cmd_flags & REQ_MIXED_MERGE) { + if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } @@ -2683,7 +2665,7 @@ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; - req->cmd_flags &= ~REQ_DONTPREP; + req->rq_flags &= ~RQF_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } @@ -2694,28 +2676,34 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { - if (req->cmd_flags & REQ_QUEUED) - blk_queue_end_tag(req->q, req); + struct request_queue *q = req->q; + + if (req->rq_flags & RQF_STATS) + blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + + if (req->rq_flags & RQF_QUEUED) + blk_queue_end_tag(q, req); BUG_ON(blk_queued_rq(req)); - if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) - laptop_io_completion(&req->q->backing_dev_info); + if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) + laptop_io_completion(req->q->backing_dev_info); blk_delete_timer(req); - if (req->cmd_flags & REQ_DONTPREP) + if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->issue_stat); req->end_io(req, 
error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); - __blk_put_request(req->q, req); + __blk_put_request(q, req); } } EXPORT_SYMBOL(blk_finish_request); @@ -2939,8 +2927,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3024,9 +3010,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); - dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; @@ -3097,6 +3080,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { @@ -3270,7 +3259,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) /* * rq is already accounted, so use raw insert */ - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); @@ -3297,48 +3286,6 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_plug *plug; - long state; - - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - - state = current->state; - while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_poll); - #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -3520,8 +3467,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -3535,5 +3485,9 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); +#ifdef CONFIG_DEBUG_FS + blk_debugfs_root = debugfs_create_dir("block", NULL); +#endif + return 0; } diff --git a/block/blk-exec.c b/block/blk-exec.c index 7ea04325d02f..8cd0e9bc8dc8 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -9,11 +9,7 @@ #include 
<linux/sched/sysctl.h> #include "blk.h" - -/* - * for max sense size - */ -#include <scsi/scsi_cmnd.h> +#include "blk-mq-sched.h" /** * blk_end_sync_rq - executes a completion event on a request @@ -55,7 +51,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; WARN_ON(irqs_disabled()); - WARN_ON(rq->cmd_type == REQ_TYPE_FS); + WARN_ON(!blk_rq_is_passthrough(rq)); rq->rq_disk = bd_disk; rq->end_io = done; @@ -65,14 +61,14 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(rq, at_head, true, false); + blk_mq_sched_insert_request(rq, at_head, true, false, false); return; } spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; rq->errors = -ENXIO; __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); @@ -100,16 +96,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); - char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; unsigned long hang_check; - if (!rq->sense) { - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - } - rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); @@ -123,11 +112,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, if (rq->errors) err = -EIO; - if (rq->sense == sense) { - rq->sense = NULL; - rq->sense_len = 0; - } - return err; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index d308def812db..0d5a9c1da1fc 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,7 +56,7 @@ * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole - * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). 
* * The above peculiarity requires that each FLUSH/FUA request has only one @@ -74,6 +74,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-mq-sched.h" /* FLUSH/FUA sequences */ enum { @@ -127,17 +128,14 @@ static void blk_flush_restore_request(struct request *rq) rq->bio = rq->biotail; /* make @rq a normal request */ - rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - struct request_queue *q = rq->q; - - blk_mq_add_to_requeue_list(rq, add_front); - blk_mq_kick_requeue_list(q); + blk_mq_add_to_requeue_list(rq, add_front, true); return false; } else { if (add_front) @@ -232,7 +230,7 @@ static void flush_end_io(struct request *flush_rq, int error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } @@ -299,8 +297,14 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return false; - /* C2 and C3 */ + /* C2 and C3 + * + * For blk-mq + scheduling, we can risk having all driver tags + * assigned to empty flushes, and we deadlock if we are expecting + * other requests to make progress. Don't defer for that case. + */ if (!list_empty(&fq->flush_data_in_flight) && + !(q->mq_ops && q->elevator) && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return false; @@ -325,12 +329,12 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) flush_rq->tag = first_rq->tag; fq->orig_rq = first_rq; - hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } - flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ); + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; + flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -343,6 +347,34 @@ static void flush_data_end_io(struct request *rq, int error) struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); /* + * Updating q->in_flight[] here for making this tag usable + * early. Because in blk_queue_start_tag(), + * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and + * reserve tags for sync I/O. + * + * More importantly this way can avoid the following I/O + * deadlock: + * + * - suppose there are 40 fua requests comming to flush queue + * and queue depth is 31 + * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc + * tag for async I/O any more + * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT + * and flush_data_end_io() is called + * - the other rqs still can't go ahead if not updating + * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs + * are held in flush data queue and make no progress of + * handling post flush rq + * - only after the post flush rq is handled, all these rqs + * can be completed + */ + + elv_completed_request(q, rq); + + /* for avoiding double accounting */ + rq->rq_flags &= ~RQF_STARTED; + + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). 
*/ @@ -358,16 +390,17 @@ static void mq_flush_data_end_io(struct request *rq, int error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_mq_run_hw_queue(hctx, true); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + + blk_mq_run_hw_queue(hctx, true); } /** @@ -398,6 +431,13 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags &= ~REQ_FUA; /* + * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any + * of those flags, we have to set REQ_SYNC to avoid skewing + * the request accounting. + */ + rq->cmd_flags |= REQ_SYNC; + + /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply @@ -420,9 +460,9 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) { - blk_mq_insert_request(rq, false, false, true); - } else + if (q->mq_ops) + blk_mq_sched_insert_request(rq, false, true, false, false); + else list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -433,7 +473,7 @@ void blk_insert_flush(struct request *rq) */ memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); - rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { rq->end_io = mq_flush_data_end_io; @@ -485,7 +525,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); @@ -512,11 +552,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, if (!fq) goto fail; - if (q->mq_ops) { + if (q->mq_ops) spin_lock_init(&fq->mq_flush_lock); - rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); - } + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); if (!fq->flush_rq) goto fail_rq; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d69c5c79f98e..9f0ff5ba4f84 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -443,10 +443,10 @@ void blk_integrity_revalidate(struct gendisk *disk) return; if (bi->profile) - disk->queue->backing_dev_info.capabilities |= + disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - disk->queue->backing_dev_info.capabilities &= + disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 381cb50a673c..b12f9c87b4c3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -35,7 +35,10 @@ static void icq_free_icq_rcu(struct rcu_head *head) kmem_cache_free(icq->__rcu_icq_cache, icq); } -/* Exit an icq. Called with both ioc and q locked. */ +/* + * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for + * mq. 
+ */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; @@ -43,8 +46,10 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + if (et->uses_mq && et->ops.mq.exit_icq) + et->ops.mq.exit_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) + et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; } @@ -164,6 +169,7 @@ EXPORT_SYMBOL(put_io_context); */ void put_io_context_active(struct io_context *ioc) { + struct elevator_type *et; unsigned long flags; struct io_cq *icq; @@ -182,13 +188,19 @@ retry: hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; - if (spin_trylock(icq->q->queue_lock)) { + + et = icq->q->elevator->type; + if (et->uses_mq) { ioc_exit_icq(icq); - spin_unlock(icq->q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - goto retry; + if (spin_trylock(icq->q->queue_lock)) { + ioc_exit_icq(icq); + spin_unlock(icq->q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + goto retry; + } } } spin_unlock_irqrestore(&ioc->lock, flags); @@ -383,8 +395,10 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->ops.elevator_init_icq_fn) - et->ops.elevator_init_icq_fn(icq); + if (et->uses_mq && et->ops.mq.init_icq) + et->ops.mq.init_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) + et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); diff --git a/block/blk-lib.c b/block/blk-lib.c index 083e56f72308..ed1e78e24db0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,8 +29,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; + sector_t bs_mask; if (!q) return -ENXIO; @@ -50,6 +51,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, op = REQ_OP_DISCARD; } + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; @@ -75,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio = next_bio(bio, 1, gfp_mask); + bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -132,28 +137,36 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL(blkdev_issue_discard); /** - * blkdev_issue_write_same - queue a write same operation + * __blkdev_issue_write_same - generate number of bios with same page * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data to write + * @biop: pointer to anchor bio * * Description: - * Issue a write same request for the sectors in question. + * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. 
*/ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) +static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page, + struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; - struct bio *bio = NULL; - int ret = 0; + struct bio *bio = *biop; + sector_t bs_mask; if (!q) return -ENXIO; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + if (!bdev_write_same(bdev)) + return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -175,34 +188,125 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } + cond_resched(); } - if (bio) { + *biop = bio; + return 0; +} + +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data + * + * Description: + * Issue a write same request for the sectors in question. + */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, + &bio); + if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); /** - * blkdev_issue_zeroout - generate number of zero filed write bios + * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio * * Description: - * Generate and issue number of bios with zerofiled pages. + * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. 
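The write-zeroes path documented above is only taken when the device advertises a limit. As a rough sketch (the queue-limit setter is assumed from the same patch series, and the value is purely illustrative), a driver opts in and a caller can probe support like this:

	/* driver side: advertise REQ_OP_WRITE_ZEROES support */
	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);

	/* caller side: __blkdev_issue_write_zeroes() returns -EOPNOTSUPP otherwise */
	if (!bdev_write_zeroes_sectors(bdev))
		pr_debug("falling back to zero-filled writes\n");
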
*/ +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); -static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + if (!q) + return -ENXIO; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + +/** + * __blkdev_issue_zeroout - generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * @discard: discard flag + * + * Description: + * Generate and issue number of bios with zerofiled pages. + */ +int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard) { int ret; - struct bio *bio = NULL; + int bi_size = 0; + struct bio *bio = *biop; unsigned int sz; + sector_t bs_mask; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + ret = 0; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), gfp_mask); @@ -212,21 +316,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); - nr_sects -= ret >> 9; - sector += ret >> 9; - if (ret < (sz << 9)) + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < (sz << 9)) break; } + cond_resched(); } - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; - } - return 0; + *biop = bio; +out: + return ret; } +EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range @@ -243,26 +346,37 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * the discard request fail, if the discard flag is not set, or if * discard_zeroes_data is not supported, this function will resort to * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. If the block device supports the WRITE SAME command - * blkdev_issue_zeroout() will use it to optimize the process of + * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME + * command(s), blkdev_issue_zeroout() will use it to optimize the process of * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. 
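Because __blkdev_issue_zeroout() is now exported, a stacking driver can build the bio chain itself and submit it once. A minimal sketch of that caller pattern, essentially what blkdev_issue_zeroout() below does, with bdev, sector and nr_sects assumed valid:

	struct bio *bio = NULL;
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOIO,
				     &bio, false);
	if (ret == 0 && bio) {
		ret = submit_bio_wait(bio);	/* waits for the whole chain */
		bio_put(bio);
	}
	blk_finish_plug(&plug);
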
*/ - int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { + int ret; + struct bio *bio = NULL; + struct blk_plug plug; + if (discard) { if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, BLKDEV_DISCARD_ZERO)) return 0; } - if (bdev_write_same(bdev) && - blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0)) == 0) + if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0))) return 0; - return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); + blk_start_plug(&plug); + ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + &bio, discard); + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index b8657fa8dc9a..2f18c2a0be1b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -60,6 +60,9 @@ static int __blk_rq_map_user_iov(struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= req_op(rq); + if (map_data && map_data->null_mapped) bio_set_flag(bio, BIO_NULL_MAPPED); @@ -88,7 +91,7 @@ static int __blk_rq_map_user_iov(struct request *rq, } /** - * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage + * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) @@ -118,6 +121,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct iov_iter i; int ret; + if (!iter_is_iovec(iter)) + goto fail; + if (map_data) copy = true; else if (iov_iter_alignment(iter) & align) @@ -135,11 +141,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: __blk_rq_unmap_user(bio); +fail: rq->bio = NULL; return -EINVAL; } @@ -193,7 +200,7 @@ int blk_rq_unmap_user(struct bio *bio) EXPORT_SYMBOL(blk_rq_unmap_user); /** - * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage + * blk_rq_map_kern - map kernel data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer @@ -228,11 +235,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (!reading) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= req_op(rq); if (do_copy) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 2642e5fc8b69..2afa262425d1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); break; + case REQ_OP_WRITE_ZEROES: + split = NULL; + nsegs = (*bio)->bi_phys_segments; + break; case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; @@ -237,15 +241,14 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; - /* - * This should probably be returning 0, but 
blk_add_request_payload() - * (Christoph!!!!) - */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; + } fbio = bio; cluster = blk_queue_cluster(q); @@ -402,38 +405,21 @@ new_segment: *bvprv = *bvec; } +static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = sglist; + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; - int nsegs, cluster; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - /* - * This is a hack - drivers should be neither modifying the - * biovec, nor relying on bi_vcnt - but because of - * blk_add_request_payload(), a discard bio may or may not have - * a payload we need to set up here (thank you Christoph) and - * bi_vcnt is really the only way of telling if we need to. - */ - if (!bio->bi_vcnt) - return 0; - /* Fall through */ - case REQ_OP_WRITE_SAME: - *sg = sglist; - bvec = bio_iovec(bio); - sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - return 1; - default: - break; - } + int cluster = blk_queue_cluster(q), nsegs = 0; for_each_bio(bio) bio_for_each_segment(bvec, bio, iter) @@ -453,10 +439,14 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg = NULL; int nsegs = 0; - if (rq->bio) + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); + else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + if (unlikely(rq->rq_flags & RQF_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; @@ -486,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ - WARN_ON(nsegs > rq->nr_phys_segments); + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } @@ -512,9 +502,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; no_merge: - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } @@ -528,9 +516,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(req->biotail, BIO_SEG_VALID)) @@ -552,9 +538,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(bio, BIO_SEG_VALID)) @@ -634,7 
+618,7 @@ void blk_rq_set_mixed_merge(struct request *rq) unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; - if (rq->cmd_flags & REQ_MIXED_MERGE) + if (rq->rq_flags & RQF_MIXED_MERGE) return; /* @@ -647,7 +631,7 @@ void blk_rq_set_mixed_merge(struct request *rq) (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } - rq->cmd_flags |= REQ_MIXED_MERGE; + rq->rq_flags |= RQF_MIXED_MERGE; } static void blk_account_io_merge(struct request *req) @@ -668,31 +652,32 @@ static void blk_account_io_merge(struct request *req) } /* - * Has to be called with the request spinlock acquired + * For non-mq, this has to be called with the request spinlock acquired. + * For mq with scheduling, the appropriate queue wide lock should be held. */ -static int attempt_merge(struct request_queue *q, struct request *req, - struct request *next) +static struct request *attempt_merge(struct request_queue *q, + struct request *req, struct request *next) { if (!rq_mergeable(req) || !rq_mergeable(next)) - return 0; + return NULL; if (req_op(req) != req_op(next)) - return 0; + return NULL; /* * not contiguous */ if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) - return 0; + return NULL; if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk || req_no_special_merge(next)) - return 0; + return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(req->bio, next->bio)) - return 0; + return NULL; /* * If we are allowed to merge, then append bio list @@ -701,7 +686,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * counts here. */ if (!ll_merge_requests_fn(q, req, next)) - return 0; + return NULL; /* * If failfast settings disagree or any of the two is already @@ -709,7 +694,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * makes sure that all involved bios have mixable attributes * set properly. 
*/ - if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); @@ -741,42 +726,51 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (blk_rq_cpu_valid(next)) req->cpu = next->cpu; - /* owner-ship of bio passed from next to req */ + /* + * ownership of bio passed from next to req, return 'next' for + * the caller to free + */ next->bio = NULL; - __blk_put_request(q, next); - return 1; + return next; } -int attempt_back_merge(struct request_queue *q, struct request *rq) +struct request *attempt_back_merge(struct request_queue *q, struct request *rq) { struct request *next = elv_latter_request(q, rq); if (next) return attempt_merge(q, rq, next); - return 0; + return NULL; } -int attempt_front_merge(struct request_queue *q, struct request *rq) +struct request *attempt_front_merge(struct request_queue *q, struct request *rq) { struct request *prev = elv_former_request(q, rq); if (prev) return attempt_merge(q, prev, rq); - return 0; + return NULL; } int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; + struct request *free; - if (e->type->ops.elevator_allow_rq_merge_fn) - if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next)) + if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) + if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) return 0; - return attempt_merge(q, rq, next); + free = attempt_merge(q, rq, next); + if (free) { + __blk_put_request(q, free); + return 1; + } + + return 0; } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) @@ -807,9 +801,12 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return true; } -int blk_try_merge(struct request *rq, struct bio *bio) +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) { - if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) + if (req_op(rq) == REQ_OP_DISCARD && + queue_max_discard_segments(rq->q) > 1) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) return ELEVATOR_FRONT_MERGE; diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c deleted file mode 100644 index bb3ed488f7b5..000000000000 --- a/block/blk-mq-cpu.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CPU notifier helper code for blk-mq - * - * Copyright (C) 2013-2014 Jens Axboe - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/list.h> -#include <linux/llist.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <linux/blk-mq.h> -#include "blk-mq.h" - -static LIST_HEAD(blk_mq_cpu_notify_list); -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); - -static int blk_mq_main_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long) hcpu; - struct blk_mq_cpu_notifier *notify; - int ret = NOTIFY_OK; - - raw_spin_lock(&blk_mq_cpu_notify_lock); - - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { - ret = notify->notify(notify->data, action, cpu); - if (ret != NOTIFY_OK) - break; - } - - raw_spin_unlock(&blk_mq_cpu_notify_lock); - return ret; -} - -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - 
BUG_ON(!notifier->notify); - - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_del(¬ifier->list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data) -{ - notifier->notify = fn; - notifier->data = data; -} - -void __init blk_mq_cpu_init(void) -{ - hotcpu_notifier(blk_mq_main_cpu_notify, 0); -} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index d0634bcf322f..8e61e8640e17 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,14 +31,16 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask) +int blk_mq_map_queues(struct blk_mq_tag_set *set) { + unsigned int *map = set->mq_map; + unsigned int nr_queues = set->nr_hw_queues; + const struct cpumask *online_mask = cpu_online_mask; unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return 1; + return -ENOMEM; cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; @@ -85,23 +87,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, free_cpumask_var(cpus); return 0; } - -unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) -{ - unsigned int *map; - - /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, - set->numa_node); - if (!map) - return NULL; - - if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) - return map; - - kfree(map); - return NULL; -} +EXPORT_SYMBOL_GPL(blk_mq_map_queues); /* * We have no quick way of doing reverse lookups. This is only used at diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c new file mode 100644 index 000000000000..f6d917977b33 --- /dev/null +++ b/block/blk-mq-debugfs.c @@ -0,0 +1,772 @@ +/* + * Copyright (C) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
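As an orientation aid for the new blk-mq-debugfs.c file, the attribute tables further down produce roughly the following layout, assuming debugfs is mounted at /sys/kernel/debug and that blk_debugfs_root is the block/ directory created elsewhere in this series (the per-queue directory name is supplied by the caller of blk_mq_debugfs_register(), outside this hunk):

	/*
	 *  /sys/kernel/debug/block/<name>/mq/<hctx-nr>/
	 *	state, flags, dispatch, ctx_map, tags, tags_bitmap,
	 *	sched_tags, sched_tags_bitmap, io_poll, stats,
	 *	dispatched, queued, run, active
	 *  /sys/kernel/debug/block/<name>/mq/<hctx-nr>/cpu<N>/
	 *	rq_list, dispatched, merged, completed
	 */
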
+ */ + +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + const struct file_operations *fops; +}; + +static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, ops); + if (!ret) { + m = file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static int hctx_state_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->state); + return 0; +} + +static int hctx_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_state_show, inode->i_private); +} + +static const struct file_operations hctx_state_fops = { + .open = hctx_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_flags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->flags); + return 0; +} + +static int hctx_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_flags_show, inode->i_private); +} + +static const struct file_operations hctx_flags_fops = { + .open = hctx_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) +{ + struct request *rq = list_entry_rq(v); + + seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", + rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags, + rq->tag, rq->internal_tag); + return 0; +} + +static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_lock(&hctx->lock); + return seq_list_start(&hctx->dispatch, *pos); +} + +static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + return seq_list_next(v, &hctx->dispatch, pos); +} + +static void hctx_dispatch_stop(struct seq_file *m, void *v) + __releases(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_unlock(&hctx->lock); +} + +static const struct seq_operations hctx_dispatch_seq_ops = { + .start = hctx_dispatch_start, + .next = hctx_dispatch_next, + .stop = hctx_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int hctx_dispatch_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops); +} + +static const struct file_operations hctx_dispatch_fops = { + .open = hctx_dispatch_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int hctx_ctx_map_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + sbitmap_bitmap_show(&hctx->ctx_map, m); + return 0; +} + +static int hctx_ctx_map_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_ctx_map_show, inode->i_private); +} + +static const struct file_operations hctx_ctx_map_fops = { + .open = hctx_ctx_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void blk_mq_debugfs_tags_show(struct seq_file *m, + struct blk_mq_tags *tags) +{ + seq_printf(m, "nr_tags=%u\n", tags->nr_tags); + seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); + 
seq_printf(m, "active_queues=%d\n", + atomic_read(&tags->active_queues)); + + seq_puts(m, "\nbitmap_tags:\n"); + sbitmap_queue_show(&tags->bitmap_tags, m); + + if (tags->nr_reserved_tags) { + seq_puts(m, "\nbreserved_tags:\n"); + sbitmap_queue_show(&tags->breserved_tags, m); + } +} + +static int hctx_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + blk_mq_debugfs_tags_show(m, hctx->tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_show, inode->i_private); +} + +static const struct file_operations hctx_tags_fops = { + .open = hctx_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_tags_bitmap_fops = { + .open = hctx_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + blk_mq_debugfs_tags_show(m, hctx->sched_tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_fops = { + .open = hctx_sched_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_bitmap_fops = { + .open = hctx_sched_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_io_poll_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "considered=%lu\n", hctx->poll_considered); + seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); + seq_printf(m, "success=%lu\n", hctx->poll_success); + return 0; +} + +static int hctx_io_poll_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_io_poll_show, inode->i_private); +} + +static ssize_t hctx_io_poll_write(struct file *file, const char __user 
*buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + return count; +} + +static const struct file_operations hctx_io_poll_fops = { + .open = hctx_io_poll_open, + .read = seq_read, + .write = hctx_io_poll_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); +} + +static int hctx_stats_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_rq_stat stat[2]; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + seq_puts(m, "read: "); + print_stat(m, &stat[BLK_STAT_READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &stat[BLK_STAT_WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int hctx_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_stats_show, inode->i_private); +} + +static ssize_t hctx_stats_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_mq_ctx *ctx; + int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + return count; +} + +static const struct file_operations hctx_stats_fops = { + .open = hctx_stats_open, + .read = seq_read, + .write = hctx_stats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); + + seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); + } + + seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); + return 0; +} + +static int hctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_dispatched_show, inode->i_private); +} + +static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) + hctx->dispatched[i] = 0; + return count; +} + +static const struct file_operations hctx_dispatched_fops = { + .open = hctx_dispatched_open, + .read = seq_read, + .write = hctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_queued_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->queued); + return 0; +} + +static int hctx_queued_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_queued_show, inode->i_private); +} + +static ssize_t hctx_queued_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->queued = 0; + return count; +} + +static const struct file_operations hctx_queued_fops = { + .open = hctx_queued_open, + .read = seq_read, + .write = hctx_queued_write, + .llseek = seq_lseek, + 
.release = single_release, +}; + +static int hctx_run_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->run); + return 0; +} + +static int hctx_run_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_run_show, inode->i_private); +} + +static ssize_t hctx_run_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->run = 0; + return count; +} + +static const struct file_operations hctx_run_fops = { + .open = hctx_run_open, + .read = seq_read, + .write = hctx_run_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_active_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + return 0; +} + +static int hctx_active_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_active_show, inode->i_private); +} + +static const struct file_operations hctx_active_fops = { + .open = hctx_active_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) + __acquires(&ctx->lock) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_lock(&ctx->lock); + return seq_list_start(&ctx->rq_list, *pos); +} + +static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + return seq_list_next(v, &ctx->rq_list, pos); +} + +static void ctx_rq_list_stop(struct seq_file *m, void *v) + __releases(&ctx->lock) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_unlock(&ctx->lock); +} + +static const struct seq_operations ctx_rq_list_seq_ops = { + .start = ctx_rq_list_start, + .next = ctx_rq_list_next, + .stop = ctx_rq_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int ctx_rq_list_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops); +} + +static const struct file_operations ctx_rq_list_fops = { + .open = ctx_rq_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int ctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); + return 0; +} + +static int ctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_dispatched_show, inode->i_private); +} + +static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; + return count; +} + +static const struct file_operations ctx_dispatched_fops = { + .open = ctx_dispatched_open, + .read = seq_read, + .write = ctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_merged_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu\n", ctx->rq_merged); + return 0; +} + +static int ctx_merged_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_merged_show, inode->i_private); +} + +static ssize_t ctx_merged_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx 
= m->private; + + ctx->rq_merged = 0; + return count; +} + +static const struct file_operations ctx_merged_fops = { + .open = ctx_merged_open, + .read = seq_read, + .write = ctx_merged_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_completed_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); + return 0; +} + +static int ctx_completed_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_completed_show, inode->i_private); +} + +static ssize_t ctx_completed_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_completed[0] = ctx->rq_completed[1] = 0; + return count; +} + +static const struct file_operations ctx_completed_fops = { + .open = ctx_completed_open, + .read = seq_read, + .write = ctx_completed_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { + {"state", 0400, &hctx_state_fops}, + {"flags", 0400, &hctx_flags_fops}, + {"dispatch", 0400, &hctx_dispatch_fops}, + {"ctx_map", 0400, &hctx_ctx_map_fops}, + {"tags", 0400, &hctx_tags_fops}, + {"tags_bitmap", 0400, &hctx_tags_bitmap_fops}, + {"sched_tags", 0400, &hctx_sched_tags_fops}, + {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, + {"io_poll", 0600, &hctx_io_poll_fops}, + {"stats", 0600, &hctx_stats_fops}, + {"dispatched", 0600, &hctx_dispatched_fops}, + {"queued", 0600, &hctx_queued_fops}, + {"run", 0600, &hctx_run_fops}, + {"active", 0400, &hctx_active_fops}, + {}, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { + {"rq_list", 0400, &ctx_rq_list_fops}, + {"dispatched", 0600, &ctx_dispatched_fops}, + {"merged", 0600, &ctx_merged_fops}, + {"completed", 0600, &ctx_completed_fops}, + {}, +}; + +int blk_mq_debugfs_register(struct request_queue *q, const char *name) +{ + if (!blk_debugfs_root) + return -ENOENT; + + q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root); + if (!q->debugfs_dir) + goto err; + + if (blk_mq_debugfs_register_hctxs(q)) + goto err; + + return 0; + +err: + blk_mq_debugfs_unregister(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ + debugfs_remove_recursive(q->debugfs_dir); + q->mq_debugfs_dir = NULL; + q->debugfs_dir = NULL; +} + +static bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) +{ + for (; attr->name; attr++) { + if (!debugfs_create_file(attr->name, attr->mode, parent, + data, attr->fops)) + return false; + } + return true; +} + +static int blk_mq_debugfs_register_ctx(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct dentry *hctx_dir) +{ + struct dentry *ctx_dir; + char name[20]; + + snprintf(name, sizeof(name), "cpu%u", ctx->cpu); + ctx_dir = debugfs_create_dir(name, hctx_dir); + if (!ctx_dir) + return -ENOMEM; + + if (!debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs)) + return -ENOMEM; + + return 0; +} + +static int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + struct dentry *hctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "%u", hctx->queue_num); + hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir); + if (!hctx_dir) + return -ENOMEM; + + if (!debugfs_create_files(hctx_dir, hctx, 
blk_mq_debugfs_hctx_attrs)) + return -ENOMEM; + + hctx_for_each_ctx(hctx, ctx, i) { + if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir)) + return -ENOMEM; + } + + return 0; +} + +int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + if (!q->debugfs_dir) + return -ENOENT; + + q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir); + if (!q->mq_debugfs_dir) + goto err; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_debugfs_register_hctx(q, hctx)) + goto err; + } + + return 0; + +err: + blk_mq_debugfs_unregister_hctxs(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ + debugfs_remove_recursive(q->mq_debugfs_dir); + q->mq_debugfs_dir = NULL; +} diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c new file mode 100644 index 000000000000..966c2169762e --- /dev/null +++ b/block/blk-mq-pci.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/kobject.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/blk-mq-pci.h> +#include <linux/pci.h> +#include <linux/module.h> + +/** + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device + * @set: tagset to provide the mapping for + * @pdev: PCI device associated with @set. + * + * This function assumes the PCI device @pdev has at least as many available + * interrupt vectors as @set has queues. It will then query the vector + * corresponding to each queue for its affinity mask and build a queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector.
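A hypothetical consumer of the helper documented above (and defined just below): a PCI driver points its tag set's queue-mapping hook at it. The driver structure and its fields are illustrative, not part of this patch; the .map_queues hook is the one added to blk_mq_ops elsewhere in this series.

	static int sketch_map_queues(struct blk_mq_tag_set *set)
	{
		struct sketch_dev *sdev = set->driver_data;	/* hypothetical driver data */

		return blk_mq_pci_map_queues(set, sdev->pdev);
	}

	/* ... and in the driver's blk_mq_ops: .map_queues = sketch_map_queues, ... */
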
+ */ +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue); + if (!mask) + return -EINVAL; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c new file mode 100644 index 000000000000..9e8d6795a8c1 --- /dev/null +++ b/block/blk-mq-sched.c @@ -0,0 +1,515 @@ +/* + * blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/blk-mq.h> + +#include <trace/events/block.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-wbt.h" + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (exit && hctx->sched_data) + exit(hctx); + kfree(hctx->sched_data); + hctx->sched_data = NULL; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int ret; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); + if (!hctx->sched_data) { + ret = -ENOMEM; + goto error; + } + + if (init) { + ret = init(hctx); + if (ret) { + /* + * We don't want to give exit() a partially + * initialized sched_data. init() must clean up + * if it fails. + */ + kfree(hctx->sched_data); + hctx->sched_data = NULL; + goto error; + } + } + } + + return 0; +error: + blk_mq_sched_free_hctx_data(q, exit); + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); + +static void __blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, + struct bio *bio, + struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } + + rq->elv.icq = icq; + if (!blk_mq_sched_get_rq_priv(q, rq, bio)) { + rq->rq_flags |= RQF_ELVPRIV; + get_io_context(icq->ioc); + return; + } + + rq->elv.icq = NULL; +} + +static void blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct bio *bio) +{ + struct io_context *ioc; + + ioc = rq_ioc(bio); + if (ioc) + __blk_mq_sched_assign_ioc(q, rq, bio, ioc); +} + +struct request *blk_mq_sched_get_request(struct request_queue *q, + struct bio *bio, + unsigned int op, + struct blk_mq_alloc_data *data) +{ + struct elevator_queue *e = q->elevator; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + + blk_queue_enter_live(q); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); + + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; + + /* + * Flush requests are special and go directly to the + * dispatch list. 
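Going back to the hctx_data helpers near the top of blk-mq-sched.c: they hand a scheduler pre-allocated per-hctx storage and call its init hook afterwards. A minimal hypothetical user (struct and function names are illustrative):

	struct sketch_hctx_data {
		spinlock_t lock;
		struct list_head rqs;
	};

	static int sketch_init_hctx(struct blk_mq_hw_ctx *hctx)
	{
		struct sketch_hctx_data *shd = hctx->sched_data;	/* already allocated */

		spin_lock_init(&shd->lock);
		INIT_LIST_HEAD(&shd->rqs);
		return 0;
	}

	/* blk_mq_sched_init_hctx_data(q, sizeof(struct sketch_hctx_data),
	 *			       sketch_init_hctx, NULL); */
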
+ */ + if (!op_is_flush(op) && e->type->ops.mq.get_request) { + rq = e->type->ops.mq.get_request(q, op, data); + if (rq) + rq->rq_flags |= RQF_QUEUED; + } else + rq = __blk_mq_alloc_request(data, op); + } else { + rq = __blk_mq_alloc_request(data, op); + if (rq) + data->hctx->tags->rqs[rq->tag] = rq; + } + + if (rq) { + if (!op_is_flush(op)) { + rq->elv.icq = NULL; + if (e && e->type->icq_cache) + blk_mq_sched_assign_ioc(q, rq, bio); + } + data->hctx->queued++; + return rq; + } + + blk_queue_exit(q); + return NULL; +} + +void blk_mq_sched_put_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (rq->rq_flags & RQF_ELVPRIV) { + blk_mq_sched_put_rq_priv(rq->q, rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) + e->type->ops.mq.put_request(rq); + else + blk_mq_finish_request(rq); +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; + bool did_work = false; + LIST_HEAD(rq_list); + + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + hctx->run++; + + /* + * If we have previous entries on our dispatch list, grab them first for + * more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Only ask the scheduler for requests, if we didn't have residual + * requests from the dispatch list. This is to avoid the case where + * we only ever dispatch a fraction of the requests available because + * of low device queue depth. Once we pull requests out of the IO + * scheduler, we can no longer merge or sort them. So it's best to + * leave them there for as long as we can. Mark the hw queue as + * needing a restart in that case. + */ + if (!list_empty(&rq_list)) { + blk_mq_sched_mark_restart(hctx); + did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); + } else if (!has_sched_dispatch) { + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } + + /* + * We want to dispatch from the scheduler if we had no work left + * on the dispatch list, OR if we did have work but weren't able + * to make progress. 
+ */ + if (!did_work && has_sched_dispatch) { + do { + struct request *rq; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); + } +} + +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)) +{ + do { + struct request *rq; + + rq = get_rq(hctx); + if (!rq) + break; + + list_add_tail(&rq->queuelist, rq_list); + } while (1); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + struct request **merged_request) +{ + struct request *rq; + + switch (elv_merge(q, &rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (!bio_attempt_back_merge(q, rq, bio)) + return false; + *merged_request = attempt_back_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); + return true; + case ELEVATOR_FRONT_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (!bio_attempt_front_merge(q, rq, bio)) + return false; + *merged_request = attempt_front_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); + +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.mq.bio_merge) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_put_ctx(ctx); + return e->type->ops.mq.bio_merge(hctx, bio); + } + + return false; +} + +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +{ + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); + +void blk_mq_sched_request_inserted(struct request *rq) +{ + trace_block_rq_insert(rq->q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); + +static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1) { + rq->rq_flags |= RQF_SORTED; + return false; + } + + /* + * If we already have a real request tag, send directly to + * the dispatch list. + */ + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; +} + +static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (blk_mq_hctx_has_pending(hctx)) + blk_mq_run_hw_queue(hctx, true); + } +} + +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + blk_mq_sched_restart_hctx(hctx); + else { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + return; + + clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_restart_hctx(hctx); + } +} + +/* + * Add flush/fua to the queue. If we fail getting a driver tag, then + * punt to the requeue list. Requeue will re-invoke us from a context + * that's safe to block from. 
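Putting the hooks exercised above together, a bare-bones sketch of how an mq I/O scheduler plugs into this framework; only the hooks called from this file are shown, init/exit and per-hctx data setup are omitted, and the scheduler name is made up:

	static void sketch_insert_requests(struct blk_mq_hw_ctx *hctx,
					   struct list_head *list, bool at_head)
	{
		/* queue the requests on private per-hctx state, e.g. hctx->sched_data */
	}

	static struct request *sketch_dispatch_request(struct blk_mq_hw_ctx *hctx)
	{
		return NULL;	/* hand back the next request, or NULL when idle */
	}

	static bool sketch_has_work(struct blk_mq_hw_ctx *hctx)
	{
		return false;	/* true when dispatch_request() would return something */
	}

	static struct elevator_type sketch_iosched = {
		.ops.mq = {
			.insert_requests	= sketch_insert_requests,
			.dispatch_request	= sketch_dispatch_request,
			.has_work		= sketch_has_work,
		},
		.uses_mq	= true,
		.elevator_name	= "sketch",
		.elevator_owner	= THIS_MODULE,
	};

A real scheduler would also implement bio_merge, typically by calling blk_mq_sched_try_merge() under its own lock and freeing any request handed back through merged_request, and would register itself with elv_register() like any other elevator.
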
+ */ +static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool can_block) +{ + if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { + blk_insert_flush(rq); + blk_mq_run_hw_queue(hctx, true); + } else + blk_mq_add_to_requeue_list(rq, false, true); +} + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + blk_mq_sched_insert_flush(hctx, rq, can_block); + return; + } + + if (e && blk_mq_sched_bypass_insert(hctx, rq)) + goto run; + + if (e && e->type->ops.mq.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.mq.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + +run: + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct elevator_queue *e = hctx->queue->elevator; + + if (e) { + struct request *rq, *next; + + /* + * We bypass requests that already have a driver tag assigned, + * which should only be flushes. Flushes are only ever inserted + * as single requests, so we shouldn't ever hit the + * WARN_ON_ONCE() below (but let's handle it just in case). + */ + list_for_each_entry_safe(rq, next, list, queuelist) { + if (WARN_ON_ONCE(rq->tag != -1)) { + list_del_init(&rq->queuelist); + blk_mq_sched_bypass_insert(hctx, rq); + } + } + } + + if (e && e->type->ops.mq.insert_requests) + e->type->ops.mq.insert_requests(hctx, list, false); + else + blk_mq_insert_requests(hctx, ctx, list); + + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + if (hctx->sched_tags) { + blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } +} + +int blk_mq_sched_setup(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Default to 256, since we don't split into sync/async like the + * old code did. Additionally, this is a per-hw queue depth. + */ + q->nr_requests = 2 * BLKDEV_MAX_RQ; + + /* + * We're switching to using an IO scheduler, so setup the hctx + * scheduler tags and switch the request map from the regular + * tags to scheduler tags. First allocate what we need, so we + * can safely fail and fallback, if needed. 
+ */ + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + if (!hctx->sched_tags) { + ret = -ENOMEM; + break; + } + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); + if (ret) + break; + } + + /* + * If we failed, free what we did allocate + */ + if (ret) { + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + blk_mq_sched_free_tags(set, hctx, i); + } + + return ret; + } + + return 0; +} + +void blk_mq_sched_teardown(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_free_tags(set, hctx, i); +} + +int blk_mq_sched_init(struct request_queue *q) +{ + int ret; + +#if defined(CONFIG_DEFAULT_SQ_NONE) + if (q->nr_hw_queues == 1) + return 0; +#endif +#if defined(CONFIG_DEFAULT_MQ_NONE) + if (q->nr_hw_queues > 1) + return 0; +#endif + + mutex_lock(&q->sysfs_lock); + ret = elevator_init(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return ret; +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h new file mode 100644 index 000000000000..7b5f3b95c78e --- /dev/null +++ b/block/blk-mq-sched.h @@ -0,0 +1,143 @@ +#ifndef BLK_MQ_SCHED_H +#define BLK_MQ_SCHED_H + +#include "blk-mq.h" +#include "blk-mq-tag.h" + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)); + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)); + +struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data); +void blk_mq_sched_put_request(struct request *rq); + +void blk_mq_sched_request_inserted(struct request *rq); +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + struct request **merged_request); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block); +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async); + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)); + +int blk_mq_sched_setup(struct request_queue *q); +void blk_mq_sched_teardown(struct request_queue *q); + +int blk_mq_sched_init(struct request_queue *q); + +static inline bool +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + return false; + + return __blk_mq_sched_bio_merge(q, bio); +} + +static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, + struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.get_rq_priv) + return e->type->ops.mq.get_rq_priv(q, rq, bio); + + return 0; +} + +static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.put_rq_priv) + e->type->ops.mq.put_rq_priv(q, rq); +} + +static inline bool 
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + + return true; +} + +static inline void +blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.completed_request) + e->type->ops.mq.completed_request(hctx, rq); + + BUG_ON(rq->internal_tag == -1); + + blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); +} + +static inline void blk_mq_sched_started_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.started_request) + e->type->ops.mq.started_request(rq); +} + +static inline void blk_mq_sched_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.requeue_request) + e->type->ops.mq.requeue_request(rq); +} + +static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.has_work) + return e->type->ops.mq.has_work(hctx); + + return false; +} + +static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + } + } +} + +static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +#endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4..295e69670c39 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -122,111 +122,16 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, return res; } -static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], - ctx->rq_dispatched[0]); -} - -static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu\n", ctx->rq_merged); -} - -static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], - ctx->rq_completed[0]); -} - -static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) -{ - struct request *rq; - int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg); - - list_for_each_entry(rq, list, queuelist) { - const int rq_len = 2 * sizeof(rq) + 2; - - /* if the output will be truncated */ - if (PAGE_SIZE - 1 < len + rq_len) { - /* backspacing if it can't hold '\t...\n' */ - if (PAGE_SIZE - 1 < len + 5) - len -= rq_len; - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t...\n"); - break; - } - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t%p\n", rq); - } - - return len; -} - -static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) -{ - ssize_t ret; - - spin_lock(&ctx->lock); - ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); - spin_unlock(&ctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "invoked=%lu, success=%lu\n", 
hctx->poll_invoked, hctx->poll_success); -} - -static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - return sprintf(page, "%lu\n", hctx->queued); -} - -static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%lu\n", hctx->run); -} - -static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - char *start_page = page; - int i; - - page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); - - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); - } - - return page - start_page; -} - -static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, +static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", hctx->tags->nr_tags); } -static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) { - return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); + return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) @@ -247,73 +152,27 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_sysfs_dispatched_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { - .attr = {.name = "merged", .mode = S_IRUGO }, - .show = blk_mq_sysfs_merged_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { - .attr = {.name = "completed", .mode = S_IRUGO }, - .show = blk_mq_sysfs_completed_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { - .attr = {.name = "rq_list", .mode = S_IRUGO }, - .show = blk_mq_sysfs_rq_list_show, -}; - static struct attribute *default_ctx_attrs[] = { - &blk_mq_sysfs_dispatched.attr, - &blk_mq_sysfs_merged.attr, - &blk_mq_sysfs_completed.attr, - &blk_mq_sysfs_rq_list.attr, NULL, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { - .attr = {.name = "queued", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_queued_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { - .attr = {.name = "run", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_run_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_dispatched_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { + .attr = {.name = "nr_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { - .attr = {.name = "active", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_active_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { - .attr = {.name = "pending", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_rq_list_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 
- .attr = {.name = "tags", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_tags_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { + .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_poll_show, -}; static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_queued.attr, - &blk_mq_hw_sysfs_run.attr, - &blk_mq_hw_sysfs_dispatched.attr, - &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_tags.attr, + &blk_mq_hw_sysfs_nr_tags.attr, + &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, - &blk_mq_hw_sysfs_active.attr, - &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -380,9 +239,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -396,19 +254,21 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&hctx->kobj); } + blk_mq_debugfs_unregister_hctxs(q); + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -430,10 +290,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -447,6 +305,8 @@ int blk_mq_register_disk(struct gendisk *disk) kobject_uevent(&q->mq_kobj, KOBJ_ADD); + blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) @@ -454,7 +314,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -462,7 +322,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { @@ -472,6 +332,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q) if (!q->mq_sysfs_init_done) return; + blk_mq_debugfs_unregister_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -484,6 +346,8 @@ int blk_mq_sysfs_register(struct request_queue *q) if (!q->mq_sysfs_init_done) return ret; + blk_mq_debugfs_register_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b..54c84363c1b2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,58 +1,24 @@ /* - * Fast and scalable bitmap tagging variant. 
Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. - */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. 
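For example, with a shared map of 256 tags and 3 active queues, each hctx is allowed up to max(DIV_ROUND_UP(256, 3), 4) = 86 outstanding requests; with 128 active queues the per-queue allowance bottoms out at the 4-tag floor that hctx_may_queue() enforces below.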
*/ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,337 +86,191 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) +static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) { - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. - */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) -{ - unsigned int last_tag, org_last_tag; - int index, i, tag; - - if (!hctx_may_queue(hctx, bt)) + if (!(data->flags & BLK_MQ_REQ_INTERNAL) && + !hctx_may_queue(data->hctx, bt)) return -1; - - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); - - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. 
- */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->depth - 1) - last_tag = 0; - - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt); } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) -{ - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - -static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - struct bt_wait_state *bs; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); + unsigned int tag_offset; + bool drop_ctx; int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + bt = &tags->breserved_tags; + tag_offset = 0; + } else { + bt = &tags->bitmap_tags; + tag_offset = tags->nr_reserved_tags; + } + + tag = __blk_mq_get_tag(data, bt); if (tag != -1) - return tag; + goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) - return -1; + return BLK_MQ_TAG_FAIL; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); + drop_ctx = data->ctx == NULL; do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for - * some to complete. Note that hctx can be NULL here for - * reserved tag allocation. + * some to complete. */ - if (hctx) - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
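Reserved tags come out of the separate breserved_tags map selected above. A sketch of how a driver might pull one for an out-of-band command, assuming its tag set was registered with a non-zero reserved_tags count; the function name and the use of READ as the op are illustration only:

#include <linux/blk-mq.h>
#include <linux/err.h>

static int example_send_oob_cmd(struct request_queue *q)
{
	struct request *rq;

	/* dip into the reserved pool; fail instead of sleeping if it is empty */
	rq = blk_mq_alloc_request(q, READ, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* ... fill in the driver-specific payload and issue the request ... */

	blk_mq_free_request(rq);
	return 0;
}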
*/ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; - blk_mq_put_ctx(data->ctx); + if (data->ctx) + blk_mq_put_ctx(data->ctx); io_schedule(); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = data->q->mq_ops->map_queue(data->q, - data->ctx->cpu); - if (data->flags & BLK_MQ_REQ_RESERVED) { - bt = &data->hctx->tags->breserved_tags; - } else { - last_tag = &data->ctx->last_tag; - hctx = data->hctx; - bt = &hctx->tags->bitmap_tags; - } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); + tags = blk_mq_tags_from_data(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + bt = &tags->breserved_tags; + else + bt = &tags->bitmap_tags; + + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, data->hctx); } while (1); - finish_wait(&bs->wait, &wait); - return tag; -} - -static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - int tag; + if (drop_ctx && data->ctx) + blk_mq_put_ctx(data->ctx); - tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); - if (tag >= 0) - return tag + data->hctx->tags->nr_reserved_tags; + finish_wait(&ws->wait, &wait); - return BLK_MQ_TAG_FAIL; +found_tag: + return tag + tag_offset; } -static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag) { - int tag, zero = 0; + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; - if (unlikely(!data->hctx->tags->nr_reserved_tags)) { - WARN_ON_ONCE(1); - return BLK_MQ_TAG_FAIL; + BUG_ON(real_tag >= tags->nr_tags); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } - - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - - return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - if (data->flags & BLK_MQ_REQ_RESERVED) - return __blk_mq_get_reserved_tag(data); - return __blk_mq_get_tag(data); -} +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; + struct request *rq; - wake_index = bt_index_inc(wake_index); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - return NULL; + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; } -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) { - const int index = TAG_TO_INDEX(bt, tag); - struct 
bt_wait_state *bs; - int wait_cnt; - - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) -{ - struct blk_mq_tags *tags = hctx->tags; - - if (tag >= tags->nr_reserved_tags) { - const int real_tag = tag - tags->nr_reserved_tags; - - BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; - } else { - BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); - } -} +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) { - struct request *rq; - int bit, i; - - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + struct bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; - off += (1 << bt->bits_per_word); - } + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -496,11 +296,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) 
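Both the busy iterators above (via sbitmap_for_each_set()) and the allocation paths below now lean on the generic sbitmap code. A condensed sketch of the sbitmap_queue calls this file uses, with the depth and node values being arbitrary illustration:

#include <linux/sbitmap.h>
#include <linux/smp.h>

static int example_sbq(void)
{
	struct sbitmap_queue sbq;
	int nr;

	/* shift of -1 lets sbitmap pick the per-word size; no round-robin */
	if (sbitmap_queue_init_node(&sbq, 256, -1, false, GFP_KERNEL, NUMA_NO_NODE))
		return -ENOMEM;

	nr = __sbitmap_queue_get(&sbq);		/* returns -1 when the map is exhausted */
	if (nr >= 0)
		sbitmap_queue_clear(&sbq, nr, raw_smp_processor_id());

	sbitmap_queue_wake_all(&sbq);		/* wake any sleepers, e.g. after a resize */
	sbitmap_queue_free(&sbq);
	return 0;
}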
struct blk_mq_tags *tags = set->tags[i]; for (j = 0; j < tags->nr_tags; j++) { - if (!tags->rqs[j]) + if (!tags->static_rqs[j]) continue; ret = set->ops->reinit_request(set->driver_data, - tags->rqs[j]); + tags->static_rqs[j]); if (ret) goto out; } @@ -529,124 +329,35 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } } -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); - } - - return bt->depth - used; -} - -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) -{ - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; -} - -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) -{ - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. 
- */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -666,11 +377,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { - kfree(tags); - return NULL; - } - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; @@ -679,31 +385,61 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); - free_cpumask_var(tags->cpumask); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tagsptr, unsigned int tdepth, + bool can_grow) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + struct blk_mq_tags *tags = *tagsptr; - *tag = prandom_u32() % depth; -} + if (tdepth <= tags->nr_reserved_tags) + return -EINVAL; -int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) -{ tdepth -= tags->nr_reserved_tags; - if (tdepth > tags->nr_tags) - return -EINVAL; /* - * Don't need (or can't) update reserved tags here, they remain - * static and should never need resizing. + * If we are allowed to grow beyond the original size, allocate + * a new set of tags before freeing the old one. */ - bt_update_count(&tags->bitmap_tags, tdepth); - blk_mq_tag_wakeup_all(tags, false); + if (tdepth > tags->nr_tags) { + struct blk_mq_tag_set *set = hctx->queue->tag_set; + struct blk_mq_tags *new; + bool ret; + + if (!can_grow) + return -EINVAL; + + /* + * We need some sort of upper limit, set it high enough that + * no valid use cases should require more. 
+ */ + if (tdepth > 16 * BLKDEV_MAX_RQ) + return -EINVAL; + + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + if (!new) + return -ENOMEM; + ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); + if (ret) { + blk_mq_free_rq_map(new); + return -ENOMEM; + } + + blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); + blk_mq_free_rq_map(*tagsptr); + *tagsptr = new; + } else { + /* + * Don't need (or can't) update reserved tags here, they + * remain static and should never need resizing. + */ + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + } + return 0; } @@ -726,7 +462,7 @@ u32 blk_mq_unique_tag(struct request *rq) int hwq = 0; if (q->mq_ops) { - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); hwq = hctx->queue_num; } @@ -734,25 +470,3 @@ u32 blk_mq_unique_tag(struct request *rq) (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); - -ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) -{ - char *orig_page = page; - unsigned int free, res; - - if (!tags) - return 0; - - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " - "bits_per_word=%u\n", - tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); - - free = bt_unused_tags(&tags->bitmap_tags); - res = bt_unused_tags(&tags->breserved_tags); - - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); - page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); - - return page - orig_page; -} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a..63497423c5cd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. 
*/ @@ -37,14 +12,12 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; + struct request **static_rqs; struct list_head page_list; - - int alloc_policy; - cpumask_var_t cpumask; }; @@ -52,15 +25,24 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); -extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); -extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, + unsigned int depth, bool can_grow); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index c207fa9870eb..b29e7dc7b309 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include <linux/sched/sysctl.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/prefetch.h> #include <trace/events/block.h> @@ -29,53 +30,37 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" +#include "blk-wbt.h" +#include "blk-mq-sched.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) -{ - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; -} - -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; + return sbitmap_any_bit_set(&hctx->ctx_map) || + !list_empty_careful(&hctx->dispatch) || + blk_mq_sched_has_work(hctx); } -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, 
ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -135,6 +120,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/** + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. Additionally, it is not prevented that + * new queue_rq() calls occur unless the queue has been stopped first. + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + blk_mq_stop_hw_queues(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(&hctx->queue_rq_srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -158,18 +170,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op) { - if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; + if (blk_queue_io_stat(q)) + rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); @@ -189,13 +199,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->special = NULL; /* tag was already set */ rq->errors = 0; - - rq->cmd = rq->__cmd; - rq->extra_len = 0; - rq->sense_len = 0; - rq->resid_len = 0; - rq->sense = NULL; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -204,65 +208,60 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } +EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op) { struct request *rq; unsigned int tag; tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->rqs[tag]; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); - if (blk_mq_tag_busy(data->hctx)) { - rq->cmd_flags = REQ_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + rq = tags->static_rqs[tag]; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; } - rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } return NULL; } +EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_ctx *ctx; - struct blk_mq_hw_ctx *hctx; + struct 
blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; int ret; ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } - blk_mq_put_ctx(ctx); - if (!rq) { - blk_queue_exit(q); + if (!rq) return ERR_PTR(-EWOULDBLOCK); - } rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -308,7 +307,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -322,38 +321,45 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct request *rq) +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { - const int tag = rq->tag; + const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; - if (rq->cmd_flags & REQ_MQ_INFLIGHT) + if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); - rq->cmd_flags = 0; + + wbt_done(q->rq_wb, &rq->issue_stat); + rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); +} +void blk_mq_finish_request(struct request *rq) +{ + blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } -EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; - - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); - blk_mq_free_hctx_request(hctx, rq); + blk_mq_sched_put_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -362,6 +368,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->issue_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -412,10 +419,27 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + if (rq->rq_flags & RQF_STATS) { + /* + * We could rq->mq_ctx here, but there's less of a risk + * of races if we have the completion event 
add the stats + * to the local software queue. + */ + struct blk_mq_ctx *ctx; + + ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); + blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + } +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -453,11 +477,15 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_sched_started_request(rq); + trace_block_rq_issue(q, rq); - rq->resid_len = blk_rq_bytes(rq); - if (unlikely(blk_bidi_rq(rq))) - rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + blk_stat_set_issue_time(&rq->issue_stat); + rq->rq_flags |= RQF_STATS; + wbt_issue(q->rq_wb, &rq->issue_stat); + } blk_add_timer(rq); @@ -494,6 +522,8 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); + blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -501,19 +531,19 @@ static void __blk_mq_requeue_request(struct request *rq) } } -void blk_mq_requeue_request(struct request *rq) +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { __blk_mq_requeue_request(rq); BUG_ON(blk_queued_rq(rq)); - blk_mq_add_to_requeue_list(rq, true); + blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -523,28 +553,25 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irqrestore(&q->requeue_lock, flags); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + if (!(rq->rq_flags & RQF_SOFTBARRIER)) continue; - rq->cmd_flags &= ~REQ_SOFTBARRIER; + rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false, true); } - /* - * Use the start variant of queue running here, so that running - * the requeue work will kick stopped queues. - */ - blk_mq_start_hw_queues(q); + blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; @@ -553,30 +580,35 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) * We abuse this flag that is otherwise used by the I/O scheduler to * request head insertation from the workqueue. 
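Because queueing and kicking are now decoupled, a driver that hits a transient resource shortage can park a started request and ask for a delayed re-run of the requeue list. A small sketch; the 100 ms delay is an arbitrary value picked for illustration:

#include <linux/blk-mq.h>

/* in a driver's error/completion handling, for a request it wants retried */
static void example_retry_later(struct request *rq)
{
	blk_mq_requeue_request(rq, false);		/* queue it, do not kick yet */
	blk_mq_delay_kick_requeue_list(rq->q, 100);	/* re-run in ~100 ms */
}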
*/ - BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); } spin_unlock_irqrestore(&q->requeue_lock, flags); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_add_to_requeue_list); -void blk_mq_cancel_requeue_work(struct request_queue *q) +void blk_mq_kick_requeue_list(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } -EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); +EXPORT_SYMBOL(blk_mq_kick_requeue_list); -void blk_mq_kick_requeue_list(struct request_queue *q) +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); } -EXPORT_SYMBOL(blk_mq_kick_requeue_list); +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); void blk_mq_abort_requeue_list(struct request_queue *q) { @@ -600,8 +632,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } @@ -614,7 +648,7 @@ struct blk_mq_timeout_data { void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_ops *ops = req->q->mq_ops; + const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* @@ -729,7 +763,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q, int checked = 8; list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { - int el_ret; + bool merged = false; if (!checked--) break; @@ -737,98 +771,147 @@ static bool blk_mq_attempt_merge(struct request_queue *q, if (!blk_rq_merge_ok(rq, bio)) continue; - el_ret = blk_try_merge(rq, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - if (bio_attempt_back_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(q, rq, bio); break; - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - if (bio_attempt_front_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(q, rq, bio); break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; } + + if (merged) + ctx->rq_merged++; + return merged; } return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
*list) { - struct blk_mq_ctx *ctx; - int i; + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); - if (!bm->word) - continue; +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); +} - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait) +{ + struct blk_mq_alloc_data data = { + .q = rq->q, + .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + }; - bit++; - } while (1); + if (rq->tag != -1) { +done: + if (hctx) + *hctx = data.hctx; + return true; + } + + rq->tag = blk_mq_get_tag(&data); + if (rq->tag >= 0) { + if (blk_mq_tag_busy(data.hctx)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&data.hctx->nr_active); + } + data.hctx->tags->rqs[rq->tag] = rq; + goto done; } + + return false; } -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) { - struct request_queue *q = hctx->queue; - struct request *rq; - LIST_HEAD(rq_list); - LIST_HEAD(driver_list); - struct list_head *dptr; - int queued; - - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (rq->tag == -1 || rq->internal_tag == -1) return; - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; - hctx->run++; + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} - /* - * Touch any software queue that has pending entries. - */ - flush_busy_ctxs(hctx, &rq_list); +/* + * If we fail getting a driver tag because all the driver tags are already + * assigned and on the dispatch list, BUT the first entry does not have a + * tag, then we could deadlock. For that case, move entries with assigned + * driver tags to the front, leaving the set of tagged requests in the + * same order, and the untagged set in the same order. + */ +static bool reorder_tags_to_front(struct list_head *list) +{ + struct request *rq, *tmp, *first = NULL; - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. 
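Concretely, if the list holds A (no tag), B (tagged), C (no tag), D (tagged), the reordering below yields B, D, A, C: tagged requests move to the front while both the tagged and the untagged subsets keep their relative order, so whenever at least one request owns a driver tag it ends up at the head and dispatch can make progress.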
- */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); + list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { + if (rq == first) + break; + if (rq->tag != -1) { + list_move(&rq->queuelist, list); + if (!first) + first = rq; + } } + return first != NULL; +} + +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ + struct request_queue *q = hctx->queue; + struct request *rq; + LIST_HEAD(driver_list); + struct list_head *dptr; + int queued, ret = BLK_MQ_RQ_QUEUE_OK; + /* * Start off with dptr being NULL, so we start the first request * immediately, even if we have more pending. @@ -839,16 +922,29 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * Now process all the entries, sending them to the driver. */ queued = 0; - while (!list_empty(&rq_list)) { + while (!list_empty(list)) { struct blk_mq_queue_data bd; - int ret; - rq = list_first_entry(&rq_list, struct request, queuelist); + rq = list_first_entry(list, struct request, queuelist); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!queued && reorder_tags_to_front(list)) + continue; + + /* + * We failed getting a driver tag. Mark the queue(s) + * as needing a restart. Retry getting a tag again, + * in case the needed IO completed right before we + * marked the queue as needing a restart. + */ + blk_mq_sched_mark_restart(hctx); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; + } list_del_init(&rq->queuelist); bd.rq = rq; bd.list = dptr; - bd.last = list_empty(&rq_list); + bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -856,7 +952,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - list_add(&rq->queuelist, &rq_list); + blk_mq_put_driver_tag(hctx, rq); + list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; default: @@ -874,23 +971,21 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * We've done the first request. If we have more than 1 * left in the list, set dptr to defer issue. */ - if (!dptr && rq_list.next != rq_list.prev) + if (!dptr && list->next != list->prev) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ - if (!list_empty(&rq_list)) { + if (!list_empty(list)) { spin_lock(&hctx->lock); - list_splice(&rq_list, &hctx->dispatch); + list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but * it's possible the queue is stopped and restarted again @@ -899,8 +994,32 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * the requests in rq_list might get lost. * * blk_mq_run_hw_queue() already checks the STOPPED bit - **/ - blk_mq_run_hw_queue(hctx, true); + * + * If RESTART is set, then let completion restart the queue + * instead of potentially looping here. 
+ */ + if (!blk_mq_sched_needs_restart(hctx)) + blk_mq_run_hw_queue(hctx, true); + } + + return queued != 0; +} + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + int srcu_idx; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)); + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_sched_dispatch_requests(hctx); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + blk_mq_sched_dispatch_requests(hctx); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -916,7 +1035,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int cpu = hctx->next_cpu, next_cpu; + int next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -924,8 +1043,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; - - return cpu; } return hctx->next_cpu; @@ -933,11 +1050,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || - !blk_mq_hw_queue_mapped(hctx))) + if (unlikely(blk_mq_hctx_stopped(hctx) || + !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); @@ -948,8 +1065,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -958,9 +1074,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if ((!blk_mq_hctx_has_pending(hctx) && - list_empty_careful(&hctx->dispatch)) || - test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + if (!blk_mq_hctx_has_pending(hctx) || + blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); @@ -968,9 +1083,29 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queues); +/** + * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped + * @q: request queue. + * + * The caller is responsible for serializing this function against + * blk_mq_{start,stop}_hw_queue(). 
+ */ +bool blk_mq_queue_stopped(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hctx_stopped(hctx)) + return true; + + return false; +} +EXPORT_SYMBOL(blk_mq_queue_stopped); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1004,18 +1139,23 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); +void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (!blk_mq_hctx_stopped(hctx)) + return; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, async); +} +EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); + void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) - continue; - - clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, async); - } + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_stopped_hw_queue(hctx, async); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -1023,7 +1163,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1043,6 +1183,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + blk_mq_stop_hw_queue(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } @@ -1062,8 +1203,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -1071,36 +1212,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, blk_mq_hctx_mark_pending(hctx, ctx); } -void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static void blk_mq_insert_requests(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct list_head *list, - int depth, - bool from_schedule) +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) { - struct blk_mq_hw_ctx *hctx; - - trace_block_unplug(q, depth, !from_schedule); - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1116,8 +1231,6 @@ static void blk_mq_insert_requests(struct request_queue *q, } blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, from_schedule); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1153,9 +1266,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool 
from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, - &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, + &ctx_list, + from_schedule); } this_ctx = rq->mq_ctx; @@ -1172,8 +1286,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + from_schedule); } } @@ -1181,7 +1296,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - blk_account_io_start(rq, 1); + blk_account_io_start(rq, true); } static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) @@ -1211,68 +1326,38 @@ insert_rq: } spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); return true; } } -struct blk_map_ctx { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; -}; - -static struct request *blk_mq_map_request(struct request_queue *q, - struct bio *bio, - struct blk_map_ctx *data) +static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; - struct blk_mq_alloc_data alloc_data; - - blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); + if (rq->tag != -1) + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } - - hctx->queued++; - data->hctx = hctx; - data->ctx = ctx; - return rq; + return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { - int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, - rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, .last = 1 }; - blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + struct blk_mq_hw_ctx *hctx; + blk_qc_t new_cookie; + int ret; + + if (q->elevator) + goto insert; + + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + goto insert; + + new_cookie = request_to_qc_t(hctx, rq); /* * For OK queue, we are done. For error, kill it. 
Any other @@ -1282,7 +1367,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) { *cookie = new_cookie; - return 0; + return; } __blk_mq_requeue_request(rq); @@ -1291,10 +1376,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) *cookie = BLK_QC_T_NONE; rq->errors = -EIO; blk_mq_end_request(rq, rq->errors); - return 0; + return; } - return -1; +insert: + blk_mq_sched_insert_request(rq, false, true, true, false); } /* @@ -1304,14 +1390,15 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_map_ctx data; + const int is_sync = op_is_sync(bio->bi_opf); + const int is_flush_fua = op_is_flush(bio->bi_opf); + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; - unsigned int request_count = 0; + unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1326,13 +1413,26 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; - rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); + return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); + + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1351,7 +1451,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * We do limited pluging. If the bio can be merged, do that. + * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. So the plug list will have one request at most */ @@ -1371,12 +1471,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - if (!blk_mq_direct_issue_request(old_rq, &cookie)) - goto done; - blk_mq_insert_request(old_rq, false, true, true); + + if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_try_issue_directly(old_rq, &cookie); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); + blk_mq_try_issue_directly(old_rq, &cookie); + srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + } goto done; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. 
For @@ -1398,13 +1513,14 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_sync = op_is_sync(bio->bi_opf); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_map_ctx data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1421,13 +1537,26 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); - rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + wbt_track(&rq->issue_stat, wb_acct); + + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1440,13 +1569,25 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) */ plug = current->plug; if (plug) { + struct request *last = NULL; + blk_mq_bio_to_request(rq, bio); + + /* + * @request_count may become stale because of schedule + * out, so check the list again. + */ + if (list_empty(&plug->mq_list)) + request_count = 0; if (!request_count) trace_block_plug(q); + else + last = list_entry_rq(plug->mq_list.prev); blk_mq_put_ctx(data.ctx); - if (request_count >= BLK_MAX_REQUEST_COUNT) { + if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); } @@ -1455,6 +1596,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) return cookie; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1467,20 +1616,12 @@ run_queue: } blk_mq_put_ctx(data.ctx); +done: return cookie; } -/* - * Default mapping to a software queue, since we use one per CPU. 
- */ -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) -{ - return q->queue_hw_ctx[q->mq_map[cpu]]; -} -EXPORT_SYMBOL(blk_mq_map_queue); - -static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1488,11 +1629,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, int i; for (i = 0; i < tags->nr_tags; i++) { - if (!tags->rqs[i]) + struct request *rq = tags->static_rqs[i]; + + if (!rq) continue; - set->ops->exit_request(set->driver_data, tags->rqs[i], + set->ops->exit_request(set->driver_data, rq, hctx_idx, i); - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; } } @@ -1506,49 +1649,73 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, kmemleak_free(page_address(page)); __free_pages(page, page->private); } +} +void blk_mq_free_rq_map(struct blk_mq_tags *tags) +{ kfree(tags->rqs); + tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; blk_mq_free_tags(tags); } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - -static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->numa_node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; - INIT_LIST_HEAD(&tags->page_list); - - tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), - GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, set->numa_node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; } + tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags); + return NULL; + } + + return tags; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&tags->page_list); + /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); - left = rq_size * set->queue_depth; + left = rq_size * depth; - for (i = 0; i < set->queue_depth; ) { + for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; @@ -1559,7 +1726,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, do { page = alloc_pages_node(set->numa_node, - GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; @@ -1580,17 +1747,19 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). 
*/ - kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, set->queue_depth - i); + to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - tags->rqs[i] = p; + struct request *rq = p; + + tags->static_rqs[i] = rq; if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, - tags->rqs[i], hctx_idx, i, + rq, hctx_idx, i, set->numa_node)) { - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; goto fail; } } @@ -1599,37 +1768,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; + return 0; fail: - blk_mq_free_rq_map(set, tags, hctx_idx); - return NULL; -} - -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; } /* @@ -1637,11 +1780,13 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) * software queue to the hw queue dispatch list, and ensure that it * gets run. */ -static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { + struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); spin_lock(&ctx->lock); @@ -1652,30 +1797,20 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return NOTIFY_OK; + return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); - return NOTIFY_OK; + return 0; } -static int blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx = data; - - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - return blk_mq_hctx_cpu_offline(hctx, cpu); - - /* - * In case of CPU online, tags may be reallocated - * in blk_mq_map_swqueue() after mapping is updated. 
- */ - - return NOTIFY_OK; + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); } /* hctx->ctxs will be freed in queue's release handler */ @@ -1695,9 +1830,12 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + if (hctx->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(&hctx->queue_rq_srcu); + + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1734,7 +1872,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); @@ -1742,9 +1880,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -1757,7 +1893,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1776,6 +1913,9 @@ static int blk_mq_init_hctx(struct request_queue *q, flush_start_tag + hctx_idx, node)) goto free_fq; + if (hctx->flags & BLK_MQ_F_BLOCKING) + init_srcu_struct(&hctx->queue_rq_srcu); + return 0; free_fq: @@ -1784,12 +1924,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - + blk_mq_remove_cpuhp(hctx); return -1; } @@ -1807,12 +1946,14 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[BLK_STAT_READ]); + blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) continue; - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); /* * Set local node, IFF we have more than one hw queue. 
If @@ -1823,10 +1964,39 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } +static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +{ + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + } +} + static void blk_mq_map_swqueue(struct request_queue *q, const struct cpumask *online_mask) { - unsigned int i; + unsigned int i, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -1849,8 +2019,21 @@ static void blk_mq_map_swqueue(struct request_queue *q, if (!cpumask_test_cpu(i, online_mask)) continue; + hctx_idx = q->mq_map[i]; + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_rq_map(set, hctx_idx)) { + /* + * If tags initialization fail for some hctx, + * that hctx won't be brought online. In this + * case, remap the current ctx to hctx[0] which + * is guaranteed to always have tags allocated + */ + q->mq_map[i] = 0; + } + ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; @@ -1860,34 +2043,31 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { - if (set->tags[i]) { - blk_mq_free_rq_map(set, set->tags[i], i); - set->tags[i] = NULL; - } + /* Never unmap queue 0. We need it as a + * fallback in case of a new remap fails + * allocation + */ + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + hctx->tags = NULL; continue; } - /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[i]) - set->tags[i] = blk_mq_init_rq_map(set, i); hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); - cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. 
*/ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts @@ -1967,6 +2147,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; + blk_mq_sched_teardown(q); + /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) @@ -1975,7 +2157,6 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } - kfree(q->mq_map); q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2042,10 +2223,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) { - blk_mq_free_rq_map(set, hctx->tags, j); - set->tags[j] = NULL; - } + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); free_cpumask_var(hctx->cpumask); kobject_put(&hctx->kobj); @@ -2074,9 +2253,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_percpu; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; + q->mq_map = set->mq_map; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2094,7 +2271,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -2108,6 +2285,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->nr_requests = set->queue_depth; + /* + * Default to classic polling + */ + q->poll_nsec = -1; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -2123,11 +2305,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, mutex_unlock(&all_q_mutex); put_online_cpus(); + if (!(set->flags & BLK_MQ_F_NO_SCHED)) { + int ret; + + ret = blk_mq_sched_init(q); + if (ret) + return ERR_PTR(ret); + } + return q; err_hctxs: - kfree(q->mq_map); -err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); @@ -2145,6 +2333,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); @@ -2159,8 +2349,6 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); - /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe * we should change hctx numa_node according to new topology (this @@ -2172,50 +2360,18 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_register(q); } -static int blk_mq_queue_reinit_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +/* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ +static struct cpumask cpuhp_online_new; + +static void blk_mq_queue_reinit_work(void) { struct request_queue *q; - int cpu = (unsigned long)hcpu; - /* - * New online cpumask which is going to be set in this hotplug event. - * Declare this cpumasks as global as cpu-hotplug operation is invoked - * one-by-one and dynamically allocating this could result in a failure. 
- */ - static struct cpumask online_new; - - /* - * Before hotadded cpu starts handling requests, new mappings must - * be established. Otherwise, these requests in hw queue might - * never be dispatched. - * - * For example, there is a single hw queue (hctx) and two CPU queues - * (ctx0 for CPU0, and ctx1 for CPU1). - * - * Now CPU1 is just onlined and a request is inserted into - * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is - * still zero. - * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is - * set in pending bitmap and tries to retrieve requests in - * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, - * so the request in ctx1->rq_list is ignored. - */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - case CPU_UP_CANCELED: - cpumask_copy(&online_new, cpu_online_mask); - break; - case CPU_UP_PREPARE: - cpumask_copy(&online_new, cpu_online_mask); - cpumask_set_cpu(cpu, &online_new); - break; - default: - return NOTIFY_OK; - } mutex_lock(&all_q_mutex); - /* * We need to freeze and reinit all existing queues. Freezing * involves synchronous wait for an RCU grace period and doing it @@ -2225,41 +2381,62 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_start(q); - list_for_each_entry(q, &all_q_list, all_q_node) { + list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); - /* - * timeout handler can't touch hw queue during the - * reinitialization - */ - del_timer_sync(&q->timeout); - } - list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q, &online_new); + blk_mq_queue_reinit(q, &cpuhp_online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); mutex_unlock(&all_q_mutex); - return NOTIFY_OK; +} + +static int blk_mq_queue_reinit_dead(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + blk_mq_queue_reinit_work(); + return 0; +} + +/* + * Before hotadded cpu starts handling requests, new mappings must be + * established. Otherwise, these requests in hw queue might never be + * dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues (ctx0 + * for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list + * and set bit0 in pending bitmap as ctx1->index_hw is still zero. + * + * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is + * ignored. 
+ */ +static int blk_mq_queue_reinit_prepare(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &cpuhp_online_new); + blk_mq_queue_reinit_work(); + return 0; } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) + for (i = 0; i < set->nr_hw_queues; i++) + if (!__blk_mq_alloc_rq_map(set, i)) goto out_unwind; - } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); + blk_mq_free_rq_map(set->tags[i]); return -ENOMEM; } @@ -2299,12 +2476,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) -{ - return tags->cpumask; -} -EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2313,6 +2484,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + int ret; + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) @@ -2322,7 +2495,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { @@ -2351,17 +2524,35 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; - if (blk_mq_alloc_rq_maps(set)) - goto enomem; + ret = -ENOMEM; + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); + if (!set->mq_map) + goto out_free_tags; + + if (set->ops->map_queues) + ret = set->ops->map_queues(set); + else + ret = blk_mq_map_queues(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_rq_maps(set); + if (ret) + goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -enomem: + +out_free_mq_map: + kfree(set->mq_map); + set->mq_map = NULL; +out_free_tags: kfree(set->tags); set->tags = NULL; - return -ENOMEM; + return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -2369,10 +2560,11 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < nr_cpu_ids; i++) { - if (set->tags[i]) - blk_mq_free_rq_map(set, set->tags[i], i); - } + for (i = 0; i < nr_cpu_ids; i++) + blk_mq_free_map_and_requests(set, i); + + kfree(set->mq_map); + set->mq_map = NULL; kfree(set->tags); set->tags = NULL; @@ -2385,14 +2577,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_hw_ctx *hctx; int i, ret; - if (!set || nr > set->queue_depth) + if (!set) return -EINVAL; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - ret = blk_mq_tag_update_depth(hctx->tags, nr); + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. 
+ */ + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, + min(nr, set->queue_depth), + false); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } if (ret) break; } @@ -2400,6 +2606,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; } @@ -2419,10 +2628,14 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + /* + * Manually set the make_request_fn as blk_queue_make_request + * resets a lot of the queue settings. + */ if (q->nr_hw_queues > 1) - blk_queue_make_request(q, blk_mq_make_request); + q->make_request_fn = blk_mq_make_request; else - blk_queue_make_request(q, blk_sq_make_request); + q->make_request_fn = blk_sq_make_request; blk_mq_queue_reinit(q, cpu_online_mask); } @@ -2432,6 +2645,168 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static unsigned long blk_mq_poll_nsecs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_rq_stat stat[2]; + unsigned long ret = 0; + + /* + * If stats collection isn't on, don't sleep but turn it on for + * future users + */ + if (!blk_stat_enable(q)) + return 0; + + /* + * We don't have to do this once per IO, should optimize this + * to just use the current window of stats until it changes + */ + memset(&stat, 0, sizeof(stat)); + blk_hctx_stat_get(hctx, stat); + + /* + * As an optimistic guess, use half of the mean service time + * for this type of request. We can (and should) make this smarter. + * For instance, if the completion latencies are tight, we can + * get closer than just half the mean. This is especially + * important on devices where the completion latencies are longer + * than ~10 usec. + */ + if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) + ret = (stat[BLK_STAT_READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) + ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + + return ret; +} + +static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct hrtimer_sleeper hs; + enum hrtimer_mode mode; + unsigned int nsecs; + ktime_t kt; + + if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + /* + * poll_nsec can be: + * + * -1: don't ever hybrid sleep + * 0: use half of prev avg + * >0: use this specific value + */ + if (q->poll_nsec == -1) + return false; + else if (q->poll_nsec > 0) + nsecs = q->poll_nsec; + else + nsecs = blk_mq_poll_nsecs(q, hctx, rq); + + if (!nsecs) + return false; + + set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + + /* + * This will be replaced with the stats tracking code, using + * 'avg_completion_time / 2' as the pre-sleep target. 
+ */ + kt = nsecs; + + mode = HRTIMER_MODE_REL; + hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&hs.timer, kt); + + hrtimer_init_sleeper(&hs, current); + do { + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_start_expires(&hs.timer, mode); + if (hs.task) + io_schedule(); + hrtimer_cancel(&hs.timer); + mode = HRTIMER_MODE_ABS; + } while (hs.task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&hs.timer); + return true; +} + +static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct request_queue *q = hctx->queue; + long state; + + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. + */ + if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) + return true; + + hctx->poll_considered++; + + state = current->state; + while (!need_resched()) { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, rq->tag); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_plug *plug; + struct request *rq; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); + + return __blk_mq_poll(hctx, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_poll); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); @@ -2444,10 +2819,12 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { - blk_mq_cpu_init(); - - hotcpu_notifier(blk_mq_queue_reinit_notify, 0); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, + blk_mq_hctx_notify_dead); + cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare", + blk_mq_queue_reinit_prepare, + blk_mq_queue_reinit_dead); return 0; } subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..24b2256186f3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -12,14 +14,13 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long rq_dispatched[2]; unsigned long rq_merged; /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; @@ -30,28 +31,49 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void 
blk_mq_wake_waiters(struct request_queue *q); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait); + +/* + * Internal helpers for allocating/freeing the request map + */ +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags); +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth); /* + * Internal helpers for request insertion into sw queues + */ +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head); +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list); +/* * CPU hotplug helpers */ -struct blk_mq_cpu_notifier; -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data); -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_cpu_init(void); void blk_mq_enable_hotplug(void); void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} + /* * sysfs helpers */ @@ -59,19 +81,39 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +/* + * debugfs helpers + */ +#ifdef CONFIG_BLK_DEBUG_FS +int blk_mq_debugfs_register(struct request_queue *q, const char *name); +void blk_mq_debugfs_unregister(struct request_queue *q); +int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); +#else +static inline int blk_mq_debugfs_register(struct request_queue *q, + const char *name) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister(struct request_queue *q) +{ +} + +static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ +} +#endif + extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. 
- */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { @@ -114,6 +156,30 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) +{ + if (data->flags & BLK_MQ_REQ_INTERNAL) + return data->hctx->sched_tags; + + return data->hctx->tags; +} + +/* + * Internal helpers for request allocation/init/free + */ +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op); +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq); +void blk_mq_finish_request(struct request *rq); +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op); + +static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); +} + static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..1e7174ffc9d4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include "blk.h" +#include "blk-wbt.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -87,6 +88,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_discard_segments = 1; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; @@ -95,6 +97,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -107,6 +110,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->cluster = 1; + lim->zoned = BLK_ZONED_NONE; } EXPORT_SYMBOL(blk_set_default_limits); @@ -125,11 +129,13 @@ void blk_set_stacking_limits(struct queue_limits *lim) /* Inherit limits from component devices */ lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; + lim->max_discard_segments = 1; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -249,6 +255,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -298,6 +305,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_write_same_sectors); /** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int 
max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + +/** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments @@ -319,6 +339,22 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments EXPORT_SYMBOL(blk_queue_max_segments); /** + * blk_queue_max_discard_segments - set max segments for discard requests + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * segments in a discard request. + **/ +void blk_queue_max_discard_segments(struct request_queue *q, + unsigned short max_segments) +{ + q->limits.max_discard_segments = max_segments; +} +EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments); + +/** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg * @q: the request queue for the device * @max_size: max size of segment in bytes @@ -525,6 +561,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, @@ -533,6 +571,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_discard_segments = min_not_zero(t->max_discard_segments, + b->max_discard_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, b->max_integrity_segments); @@ -630,6 +670,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + if (b->chunk_sectors) + t->chunk_sectors = min_not_zero(t->chunk_sectors, + b->chunk_sectors); + return ret; } EXPORT_SYMBOL(blk_stack_limits); @@ -832,6 +876,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); /** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); +} +EXPORT_SYMBOL(blk_set_queue_depth); + +/** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device * @wc: write back cache on or off @@ -851,6 +908,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..06cf9807f49a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); * Softirq action handler - move entries to local list and loop over them * while passing them to the queue registered handler. 
*/ -static void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { struct list_head *cpu_list, local_list; @@ -78,30 +78,21 @@ static int raise_blk_irq(int cpu, struct request *rq) } #endif -static int blk_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int blk_softirq_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block blk_cpu_notifier = { - .notifier_call = blk_cpu_notify, -}; - void __blk_complete_request(struct request *req) { int ccpu, cpu; @@ -180,7 +171,9 @@ static __init int blk_softirq_init(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - register_hotcpu_notifier(&blk_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); return 0; } subsys_initcall(blk_softirq_init); diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000000..9b43efb8933f --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,256 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/blk-mq.h> + +#include "blk-stat.h" +#include "blk-mq.h" + +static void blk_stat_flush_batch(struct blk_rq_stat *stat) +{ + const s32 nr_batch = READ_ONCE(stat->nr_batch); + const s32 nr_samples = READ_ONCE(stat->nr_samples); + + if (!nr_batch) + return; + if (!nr_samples) + stat->mean = div64_s64(stat->batch, nr_batch); + else { + stat->mean = div64_s64((stat->mean * nr_samples) + + stat->batch, + nr_batch + nr_samples); + } + + stat->nr_samples += nr_batch; + stat->nr_batch = stat->batch = 0; +} + +static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + blk_stat_flush_batch(src); + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + uint64_t latest = 0; + int i, j, nr; + + blk_stat_init(&dst[BLK_STAT_READ]); + blk_stat_init(&dst[BLK_STAT_WRITE]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + if (newest > latest) + 
latest = newest; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. + */ + } while (!nr); + + dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); + memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], + sizeof(struct blk_rq_stat)); + memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], + sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. 
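As an aside on blk_stat_sum() earlier in this file: merging two sample windows only needs the running mean, the min/max and the sample counts of each side. A stand-alone sketch of the same weighted-mean merge (plain C, illustrative only, not part of the patch):

#include <stdint.h>

struct stat_win {
    int64_t mean, min, max, nr_samples;
};

/* Combine 'src' into 'dst' the way blk_stat_sum() does: min/max are
 * simple comparisons, the mean is re-weighted by the sample counts.
 * 'dst' is assumed to start out as { 0, INT64_MAX, 0, 0 }. */
static void stat_merge(struct stat_win *dst, const struct stat_win *src)
{
    if (!src->nr_samples)
        return;

    dst->min = src->min < dst->min ? src->min : dst->min;
    dst->max = src->max > dst->max ? src->max : dst->max;

    if (!dst->nr_samples)
        dst->mean = src->mean;
    else
        dst->mean = (src->mean * src->nr_samples +
                     dst->mean * dst->nr_samples) /
                    (dst->nr_samples + src->nr_samples);

    dst->nr_samples += src->nr_samples;
}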
+ * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; + stat->time = time_now & BLK_STAT_NSEC_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) +{ + return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); +} + +bool blk_stat_is_current(struct blk_rq_stat *stat) +{ + return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + + value = now - blk_stat_time(&rq->issue_stat); + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); + + stat->batch += value; + stat->nr_batch++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + } + } else { + blk_stat_init(&q->rq_stats[BLK_STAT_READ]); + blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); + } +} + +void blk_stat_set_issue_time(struct blk_issue_stat *stat) +{ + stat->time = (stat->time & BLK_STAT_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); +} + +/* + * Enable stat tracking, return whether it was enabled + */ +bool blk_stat_enable(struct request_queue *q) +{ + if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + return false; + } + + return true; +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000000..a2050a0a5314 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,42 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) + +/* + * Upper 3 bits can be used elsewhere + */ +#define BLK_STAT_RES_BITS 3 +#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) +#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK + +enum { + BLK_STAT_READ = 0, + BLK_STAT_WRITE, +}; + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *); +void blk_stat_init(struct blk_rq_stat *); +bool blk_stat_is_current(struct blk_rq_stat *); +void blk_stat_set_issue_time(struct blk_issue_stat *); +bool blk_stat_enable(struct request_queue *); + +static inline u64 __blk_stat_time(u64 time) +{ + return time & BLK_STAT_TIME_MASK; +} + +static inline u64 blk_stat_time(struct blk_issue_stat *stat) +{ + return __blk_stat_time(stat->time); +} + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f87a7e747d36..002af836aa87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 
+13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -75,7 +89,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info.ra_pages << + unsigned long ra_kb = q->backing_dev_info->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, (page)); @@ -90,7 +104,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10); + q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -107,6 +121,12 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_discard_segments_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_max_discard_segments(q), (page)); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); @@ -130,6 +150,11 @@ static ssize_t queue_physical_block_size_show(struct request_queue *q, char *pag return queue_var_show(queue_physical_block_size(q), page); } +static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.chunk_sectors, page); +} + static ssize_t queue_io_min_show(struct request_queue *q, char *page) { return queue_var_show(queue_io_min(q), page); @@ -192,6 +217,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_same_sectors << 9); } +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) @@ -212,6 +242,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(q->queue_lock); return ret; @@ -257,6 +288,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); #undef QUEUE_SYSFS_BIT_FNS +static ssize_t queue_zoned_show(struct request_queue *q, char *page) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + return sprintf(page, "host-aware\n"); + case BLK_ZONED_HM: + return sprintf(page, "host-managed\n"); + default: + return sprintf(page, "none\n"); + } +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -319,6 +362,38 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) +{ + int val; + + if (q->poll_nsec == -1) + val = -1; + else + val = q->poll_nsec / 
1000; + + return sprintf(page, "%d\n", val); +} + +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, + size_t count) +{ + int err, val; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; + + if (val == -1) + q->poll_nsec = -1; + else + q->poll_nsec = val * 1000; + + return count; +} + static ssize_t queue_poll_show(struct request_queue *q, char *page) { return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); @@ -347,6 +422,50 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_wb *rwb; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rwb = q->rq_wb; + if (!rwb) { + ret = wbt_init(q); + if (ret) + return ret; + + rwb = q->rq_wb; + if (!rwb) + return -EINVAL; + } + + if (val == -1) + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + else if (val >= 0) + rwb->min_lat_nsec = val * 1000ULL; + + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->enable_state = WBT_STATE_ON_MANUAL; + + wbt_update_limits(rwb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -384,6 +503,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -412,6 +551,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_discard_segments_entry = { + .attr = {.name = "max_discard_segments", .mode = S_IRUGO }, + .show = queue_max_discard_segments_show, +}; + static struct queue_sysfs_entry queue_max_integrity_segments_entry = { .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, .show = queue_max_integrity_segments_show, @@ -443,6 +587,11 @@ static struct queue_sysfs_entry queue_physical_block_size_entry = { .show = queue_physical_block_size_show, }; +static struct queue_sysfs_entry queue_chunk_sectors_entry = { + .attr = {.name = "chunk_sectors", .mode = S_IRUGO }, + .show = queue_chunk_sectors_show, +}; + static struct queue_sysfs_entry queue_io_min_entry = { .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, .show = queue_io_min_show, @@ -479,12 +628,22 @@ static struct queue_sysfs_entry queue_write_same_max_entry = { .show = queue_write_same_max_show, }; +static struct queue_sysfs_entry queue_write_zeroes_max_entry = { + 
.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .show = queue_write_zeroes_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, .store = queue_store_nonrot, }; +static struct queue_sysfs_entry queue_zoned_entry = { + .attr = {.name = "zoned", .mode = S_IRUGO }, + .show = queue_zoned_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -515,6 +674,12 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_poll_delay_entry = { + .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_delay_show, + .store = queue_poll_delay_store, +}; + static struct queue_sysfs_entry queue_wc_entry = { .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, .show = queue_wc_show, @@ -526,18 +691,31 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, + &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, @@ -545,7 +723,9 @@ static struct attribute *default_attrs[] = { &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, &queue_nonrot_entry.attr, + &queue_zoned_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -553,6 +733,9 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, + &queue_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, NULL, }; @@ -627,7 +810,8 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); - bdi_exit(&q->backing_dev_info); + wbt_exit(q); + bdi_put(q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -642,13 +826,19 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (!q->mq_ops) + if (!q->mq_ops) { + if (q->exit_rq_fn) + q->exit_rq_fn(q, q->fq->flush_rq); blk_free_flush_queue(q->fq); - else + } else { blk_mq_release(q); + } blk_trace_shutdown(q); + if (q->mq_ops) + blk_mq_debugfs_unregister(q); + if (q->bio_split) bioset_free(q->bio_split); @@ -667,6 +857,23 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_init(struct request_queue *q) +{ +#ifndef CONFIG_BLK_WBT_MQ + if (q->mq_ops) + return; +#endif +#ifndef CONFIG_BLK_WBT_SQ + if (q->request_fn) + return; +#endif + + /* + * If this fails, we don't 
get throttling + */ + wbt_init(q); +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -695,30 +902,36 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret; + if (q->mq_ops) + blk_mq_register_dev(dev, q); + + /* Prevent changes through sysfs until registration is completed. */ + mutex_lock(&q->sysfs_lock); + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { blk_trace_remove_sysfs(dev); - return ret; + goto unlock; } kobject_uevent(&q->kobj, KOBJ_ADD); - if (q->mq_ops) - blk_mq_register_disk(disk); - - if (!q->request_fn) - return 0; - - ret = elv_register_queue(q); - if (ret) { - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; + blk_wb_init(q); + + if (q->request_fn || (q->mq_ops && q->elevator)) { + ret = elv_register_queue(q); + if (ret) { + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + goto unlock; + } } - - return 0; + ret = 0; +unlock: + mutex_unlock(&q->sysfs_lock); + return ret; } void blk_unregister_queue(struct gendisk *disk) @@ -729,9 +942,9 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); - if (q->request_fn) + if (q->request_fn || (q->mq_ops && q->elevator)) elv_unregister_queue(q); kobject_uevent(&q->kobj, KOBJ_REMOVE); diff --git a/block/blk-tag.c b/block/blk-tag.c index f0344e6939d5..07cc329fa4b0 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -270,8 +270,9 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); - rq->cmd_flags &= ~REQ_QUEUED; + rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; + rq->internal_tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) printk(KERN_ERR "%s: tag %d is missing\n", @@ -316,7 +317,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; - if (unlikely((rq->cmd_flags & REQ_QUEUED))) { + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", __func__, rq, @@ -371,7 +372,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->cmd_flags |= REQ_QUEUED; + rq->rq_flags |= RQF_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a3ea8260c94c..82fd0cc394eb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -818,13 +818,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->io_disp[rw]++; /* - * REQ_THROTTLED is used to prevent the same bio to be throttled + * BIO_THROTTLED is used to prevent the same bio to be throttled * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. 
*/ - if (!(bio->bi_opf & REQ_THROTTLED)) - bio->bi_opf |= REQ_THROTTLED; + if (!bio_flagged(bio, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); } /** @@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg) unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = throtl_peek_queued(&sq->queued[READ]))) + bio = throtl_peek_queued(&sq->queued[READ]); + if (bio) tg_may_dispatch(tg, bio, &read_wait); - if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + bio = throtl_peek_queued(&sq->queued[WRITE]); + if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); @@ -1401,7 +1403,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) goto out; spin_lock_irq(q->queue_lock); @@ -1480,7 +1482,7 @@ out: * being issued. */ if (!throttled) - bio->bi_opf &= ~REQ_THROTTLED; + bio_clear_flag(bio, BIO_THROTTLED); return throttled; } diff --git a/block/blk-wbt.c b/block/blk-wbt.c new file mode 100644 index 000000000000..1aedb1f7ee0c --- /dev/null +++ b/block/blk-wbt.c @@ -0,0 +1,751 @@ +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include <linux/kernel.h> +#include <linux/blk_types.h> +#include <linux/slab.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> + +#include "blk-wbt.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/wbt.h> + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
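For readers less used to the cmpxchg idiom, a minimal user-space analogue of the bounded increment implemented just below (C11 atomics, illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

/* Take an in-flight slot only while the counter is still below the
 * limit.  A failed compare-exchange reloads 'cur', so a racing
 * increment is simply retried against the new value, mirroring the
 * atomic_cmpxchg() loop in atomic_inc_below(). */
bool inc_below(atomic_int *v, int below)
{
    int cur = atomic_load(v);

    while (cur < below) {
        if (atomic_compare_exchange_weak(v, &cur, cur + 1))
            return true;
    }
    return false;
}

The comparison and the increment have to be a single atomic step; a plain limit check followed by atomic_inc() could overshoot the limit when several submitters race.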
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +{ + return &rwb->rq_wait[is_kswapd]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (waitqueue_active(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +{ + struct rq_wait *rqw; + int inflight, limit; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + inflight = atomic_dec_return(&rqw->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + rwb_wake_all(rwb); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rqw->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_all(&rqw->wait); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_is_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb, wbt_stat_to_mask(stat)); + wbt_clear_state(stat); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + bool ret = false; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return false; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step > 0) + rwb->wb_max = rwb->wb_normal = 1; + else { + rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } + rwb->wb_background = 1; + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. 
If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = depth; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } + + return ret; +} + +static inline bool stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[BLK_STAT_READ].nr_samples >= 1 && + stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { + trace_wbt_lat(bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || + wb_recent_wait(rwb) || wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. 
+ */ + if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); + trace_wbt_stat(bdi, stat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_wbt_stat(bdi, stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + blk_queue_stat_get(rwb->queue, stat); + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + + trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * Hit max in previous round, stop here + */ + if (rwb->scaled_max) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + + rwb->scaled_max = calc_wb_limits(rwb); + + rwb_wake_all(rwb); + + rwb_trace_step(rwb, "step up"); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +static void scale_down(struct rq_wb *rwb, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + if (rwb->scale_step < 0 && hard_throttle) + rwb->scale_step = 0; + else + rwb->scale_step++; + + rwb->scaled_max = false; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + if (rwb->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + unsigned int inflight = wbt_inflight(rwb); + int status; + + status = latency_exceeded(rwb); + + trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + inflight); + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb, true); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN_WRITES: + /* + * We started a the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. + */ + scale_up(rwb); + break; + case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; + /* + * We get here when previously scaled reduced depth, and we + * currently don't have a valid read/write sample. For that + * case, slowly return to center state (step == 0). 
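To make the scaling behaviour concrete, a small stand-alone program (illustrative only; it mirrors calc_wb_limits() and rwb_arm_timer() above for a hypothetical device with queue depth 32 and the default 100 msec window):

#include <stdio.h>
#include <math.h>

int main(void)
{
    const unsigned int queue_depth = 32;              /* hypothetical */
    const unsigned long long win_nsec = 100ULL * 1000 * 1000;

    for (int step = 0; step <= 3; step++) {
        /* depth is recomputed from RWB_DEF_DEPTH (16) on every step */
        unsigned int depth = queue_depth < 16 ? queue_depth : 16;
        unsigned long long win = win_nsec;

        if (step > 0) {
            depth = 1 + ((depth - 1) >> step);
            win = (win_nsec << 4) /
                  (unsigned long long)sqrt((double)((step + 1) << 8));
        }
        printf("step %d: max=%2u normal=%2u background=%2u window=%llums\n",
               step, depth, (depth + 1) / 2, (depth + 3) / 4,
               win / 1000000);
    }
    return 0;
}

The output shows each positive step roughly halving the allowed write depth (16, 8, 4, 2) while the monitoring window shrinks as window / sqrt(step + 1) (100, 72, 59, 50 msec), which is the behaviour described in the file-header comment of this file.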
+ */ + if (rwb->scale_step > 0) + scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || inflight) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + rwb->scaled_max = false; + calc_wb_limits(rwb); + + rwb_wake_all(rwb); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + limit = rwb->wb_max; + else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, + wait_queue_t *wait, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + return true; + } + + /* + * If the waitqueue is already active and we are not the next + * in line to be woken up, wait for our turn. + */ + if (waitqueue_active(&rqw->wait) && + rqw->wait.task_list.next != &wait->task_list) + return false; + + return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) + __releases(lock) + __acquires(lock) +{ + struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); + DEFINE_WAIT(wait); + + if (may_queue(rwb, rqw, &wait, rw)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rqw, &wait, rw)) + break; + + if (lock) { + spin_unlock_irq(lock); + io_schedule(); + spin_lock_irq(lock); + } else + io_schedule(); + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +{ + const int op = bio_op(bio); + + /* + * If not a WRITE, do nothing + */ + if (op != REQ_OP_WRITE) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. 
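How a caller is expected to string the hooks together (the real call sites live in the block core and are not part of this hunk; this is only a kernel-style sketch using the helpers declared in blk-wbt.h later in this patch):

    /* submission side */
    enum wbt_flags flags;

    flags = wbt_wait(rwb, bio, lock);     /* may sleep on the write depth */
    wbt_track(&rq->issue_stat, flags);    /* stash the flags in issue_stat */

    wbt_issue(rwb, &rq->issue_stat);      /* when the request goes to the driver */

    /* completion side (or wbt_requeue() if the request is requeued) */
    wbt_done(rwb, &rq->issue_stat);       /* releases the in-flight slot */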
+ */ +enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + unsigned int ret = 0; + + if (!rwb_enabled(rwb)) + return 0; + + if (bio_op(bio) == REQ_OP_READ) + ret = WBT_READ; + + if (!wbt_should_throttle(rwb, bio)) { + if (ret & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return ret; + } + + __wbt_wait(rwb, bio->bi_opf, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + + return ret | WBT_TRACKED; +} + +void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + + /* + * Track sync issue, in case it takes a long time to complete. Allows + * us to react quicker, if a sync IO takes a long time to complete. + * Note that this is just a hint. 'stat' can go away when the + * request completes, so it's important we never dereference it. We + * only use the address to compare with, which is why we store the + * sync_issue time locally. + */ + if (wbt_is_read(stat) && !rwb->sync_issue) { + rwb->sync_cookie = stat; + rwb->sync_issue = blk_stat_time(stat); + } +} + +void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + if (stat == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ + if (rwb) { + rwb->queue_depth = depth; + wbt_update_limits(rwb); + } +} + +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +{ + if (rwb) + rwb->wc = write_cache_on; +} + + /* + * Disable wbt, if enabled by default. Only called from CFQ, if we have + * cgroups enabled + */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); + } +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + +u64 wbt_default_latency_nsec(struct request_queue *q) +{ + /* + * We default to 2msec for non-rotational storage, and 75msec + * for rotational storage. + */ + if (blk_queue_nonrot(q)) + return 2000000ULL; + else + return 75000000ULL; +} + +int wbt_init(struct request_queue *q) +{ + struct rq_wb *rwb; + int i; + + /* + * For now, we depend on the stats window being larger than + * our monitoring window. Ensure that this isn't inadvertently + * violated. 
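The second BUILD_BUG_ON() below encodes the layout contract between blk-stat and wbt: blk_issue_stat is a single u64 whose low 61 bits hold the issue time in nanoseconds, while the top BLK_STAT_RES_BITS (3) bits carry the wbt_flags stored by wbt_track(). A stand-alone illustration of that packing (not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define STAT_RES_BITS   3
#define STAT_SHIFT      (64 - STAT_RES_BITS)
#define STAT_TIME_MASK  ((1ULL << STAT_SHIFT) - 1)

int main(void)
{
    uint64_t now_ns = 123456789ULL;   /* example issue timestamp */
    uint64_t flags  = 0x5;            /* e.g. WBT_TRACKED | WBT_KSWAPD */
    uint64_t stat;

    /* wbt_track(): OR the flags into the top bits of the time stamp */
    stat = (now_ns & STAT_TIME_MASK) | (flags << STAT_SHIFT);

    /* blk_stat_time() / wbt_stat_to_mask(): split them apart again */
    printf("time=%llu flags=%llu\n",
           (unsigned long long)(stat & STAT_TIME_MASK),
           (unsigned long long)(stat >> STAT_SHIFT));
    return 0;
}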
+ */ + BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); + BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + atomic_set(&rwb->rq_wait[i].inflight, 0); + init_waitqueue_head(&rwb->rq_wait[i].wait); + } + + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); + rwb->wc = 1; + rwb->queue_depth = RWB_DEF_DEPTH; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->queue = q; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->enable_state = WBT_STATE_ON_DEFAULT; + wbt_update_limits(rwb); + + /* + * Assign rwb, and turn on stats tracking for this queue + */ + q->rq_wb = rwb; + blk_stat_enable(q); + + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + + return 0; +} + +void wbt_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + q->rq_wb = NULL; + kfree(rwb); + } +} diff --git a/block/blk-wbt.h b/block/blk-wbt.h new file mode 100644 index 000000000000..65f1de519f67 --- /dev/null +++ b/block/blk-wbt.h @@ -0,0 +1,171 @@ +#ifndef WB_THROTTLE_H +#define WB_THROTTLE_H + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/wait.h> +#include <linux/timer.h> +#include <linux/ktime.h> + +#include "blk-stat.h" + +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + + WBT_NR_BITS = 3, /* number of bits */ +}; + +enum { + WBT_NUM_RWQ = 2, +}; + +/* + * Enable states. Either off, or on by default (done at init time), + * or on through manual setup in sysfs. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, + WBT_STATE_ON_MANUAL = 2, +}; + +static inline void wbt_clear_state(struct blk_issue_stat *stat) +{ + stat->time &= BLK_STAT_TIME_MASK; +} + +static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) +{ + return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; +} + +static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) +{ + stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; +} + +static inline bool wbt_is_tracked(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; +} + +static inline bool wbt_is_read(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; +} + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + int scale_step; + bool scaled_max; + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
+ */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned int wc; + unsigned int queue_depth; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct request_queue *queue; + struct rq_wait rq_wait[WBT_NUM_RWQ]; +}; + +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + +#ifdef CONFIG_BLK_WBT + +void __wbt_done(struct rq_wb *, enum wbt_flags); +void wbt_done(struct rq_wb *, struct blk_issue_stat *); +enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); +int wbt_init(struct request_queue *); +void wbt_exit(struct request_queue *); +void wbt_update_limits(struct rq_wb *); +void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); +void wbt_issue(struct rq_wb *, struct blk_issue_stat *); +void wbt_disable_default(struct request_queue *); + +void wbt_set_queue_depth(struct rq_wb *, unsigned int); +void wbt_set_write_cache(struct rq_wb *, bool); + +u64 wbt_default_latency_nsec(struct request_queue *); + +#else + +static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) +{ +} +static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, + spinlock_t *lock) +{ + return 0; +} +static inline int wbt_init(struct request_queue *q) +{ + return -EINVAL; +} +static inline void wbt_exit(struct request_queue *q) +{ +} +static inline void wbt_update_limits(struct rq_wb *rwb) +{ +} +static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_disable_default(struct request_queue *q) +{ +} +static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ +} +static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +{ +} +static inline u64 wbt_default_latency_nsec(struct request_queue *q) +{ + return 0; +} + +#endif /* CONFIG_BLK_WBT */ + +#endif diff --git a/block/blk-zoned.c b/block/blk-zoned.c new file mode 100644 index 000000000000..3bd15d8095b1 --- /dev/null +++ b/block/blk-zoned.c @@ -0,0 +1,348 @@ +/* + * Zoned block device handling + * + * Copyright (c) 2015, Hannes Reinecke + * Copyright (c) 2015, SUSE Linux GmbH + * + * Copyright (c) 2016, Damien Le Moal + * Copyright (c) 2016, Western Digital + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/blkdev.h> + +static inline sector_t blk_zone_start(struct request_queue *q, + sector_t sector) +{ + sector_t zone_mask = blk_queue_zone_sectors(q) - 1; + + return sector & ~zone_mask; +} + +/* + * Check that a zone report belongs to the partition. + * If yes, fix its start sector and write pointer, copy it in the + * zone information array and return true. Return false otherwise. 
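For intuition, the remapping implemented just below with hypothetical numbers: on a drive with 524288-sector zones and a partition starting at sector 524288, a device zone reported as start = 1048576, wp = 1048832 is returned to the caller as start = 524288, wp = 524544; a conventional zone instead gets wp = start + len, and any zone that begins before the partition or extends past its nr_sects is dropped from the report.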
+ */ +static bool blkdev_report_zone(struct block_device *bdev, + struct blk_zone *rep, + struct blk_zone *zone) +{ + sector_t offset = get_start_sect(bdev); + + if (rep->start < offset) + return false; + + rep->start -= offset; + if (rep->start + rep->len > bdev->bd_part->nr_sects) + return false; + + if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) + rep->wp = rep->start + rep->len; + else + rep->wp -= offset; + memcpy(zone, rep, sizeof(struct blk_zone)); + + return true; +} + +/** + * blkdev_report_zones - Get zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @zones: Array of zone structures where to return the zones information + * @nr_zones: Number of zone structures in the zone array + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Get zone information starting from the zone containing @sector. + * The number of zone information reported may be less than the number + * requested by @nr_zones. The number of zones actually reported is + * returned in @nr_zones. + */ +int blkdev_report_zones(struct block_device *bdev, + sector_t sector, + struct blk_zone *zones, + unsigned int *nr_zones, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct blk_zone_report_hdr *hdr; + unsigned int nrz = *nr_zones; + struct page *page; + unsigned int nr_rep; + size_t rep_bytes; + unsigned int nr_pages; + struct bio *bio; + struct bio_vec *bv; + unsigned int i, n, nz; + unsigned int ofst; + void *addr; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (!nrz) + return 0; + + if (sector > bdev->bd_part->nr_sects) { + *nr_zones = 0; + return 0; + } + + /* + * The zone report has a header. So make room for it in the + * payload. Also make sure that the report fits in a single BIO + * that will not be split down the stack. + */ + rep_bytes = sizeof(struct blk_zone_report_hdr) + + sizeof(struct blk_zone) * nrz; + rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK; + if (rep_bytes > (queue_max_sectors(q) << 9)) + rep_bytes = queue_max_sectors(q) << 9; + + nr_pages = min_t(unsigned int, BIO_MAX_PAGES, + rep_bytes >> PAGE_SHIFT); + nr_pages = min_t(unsigned int, nr_pages, + queue_max_segments(q)); + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return -ENOMEM; + + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blk_zone_start(q, sector); + bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(gfp_mask); + if (!page) { + ret = -ENOMEM; + goto out; + } + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + __free_page(page); + break; + } + } + + if (i == 0) + ret = -ENOMEM; + else + ret = submit_bio_wait(bio); + if (ret) + goto out; + + /* + * Process the report result: skip the header and go through the + * reported zones to fixup and fixup the zone information for + * partitions. At the same time, return the zone information into + * the zone array. 
+ */ + n = 0; + nz = 0; + nr_rep = 0; + bio_for_each_segment_all(bv, bio, i) { + + if (!bv->bv_page) + break; + + addr = kmap_atomic(bv->bv_page); + + /* Get header in the first page */ + ofst = 0; + if (!nr_rep) { + hdr = (struct blk_zone_report_hdr *) addr; + nr_rep = hdr->nr_zones; + ofst = sizeof(struct blk_zone_report_hdr); + } + + /* Fixup and report zones */ + while (ofst < bv->bv_len && + n < nr_rep && nz < nrz) { + if (blkdev_report_zone(bdev, addr + ofst, &zones[nz])) + nz++; + ofst += sizeof(struct blk_zone); + n++; + } + + kunmap_atomic(addr); + + if (n >= nr_rep || nz >= nrz) + break; + + } + + *nr_zones = nz; +out: + bio_for_each_segment_all(bv, bio, i) + __free_page(bv->bv_page); + bio_put(bio); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); + +/** + * blkdev_reset_zones - Reset zones write pointer + * @bdev: Target block device + * @sector: Start sector of the first zone to reset + * @nr_sectors: Number of sectors, at least the length of one zone + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Reset the write pointer of the zones contained in the range + * @sector..@sector+@nr_sectors. Specifying the entire disk sector range + * is valid, but the specified range should not contain conventional zones. + */ +int blkdev_reset_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors; + sector_t end_sector = sector + nr_sectors; + struct bio *bio; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (end_sector > bdev->bd_part->nr_sects) + /* Out of range */ + return -EINVAL; + + /* Check alignment (handle eventual smaller last zone) */ + zone_sectors = blk_queue_zone_sectors(q); + if (sector & (zone_sectors - 1)) + return -EINVAL; + + if ((nr_sectors & (zone_sectors - 1)) && + end_sector != bdev->bd_part->nr_sects) + return -EINVAL; + + while (sector < end_sector) { + + bio = bio_alloc(gfp_mask, 0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + + ret = submit_bio_wait(bio); + bio_put(bio); + + if (ret) + return ret; + + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + + } + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_reset_zones); + +/** + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. 
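From user space the handler below is reached through the BLKREPORTZONE ioctl. A minimal sketch of a caller, assuming the struct blk_zone_report / struct blk_zone definitions from <linux/blkzoned.h> that accompany this series (the zone array follows the report header in memory, exactly as the copy_to_user() calls below expect):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
    unsigned int nr = 16, i;
    struct blk_zone_report *rep;
    int fd;

    if (argc < 2)
        return 1;
    fd = open(argv[1], O_RDONLY);
    if (fd < 0)
        return 1;

    rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
    if (!rep)
        return 1;
    rep->sector = 0;      /* start reporting from the first zone */
    rep->nr_zones = nr;   /* room for at most 16 zones */

    if (ioctl(fd, BLKREPORTZONE, rep) == 0) {
        /* nr_zones now holds the number of zones actually reported */
        for (i = 0; i < rep->nr_zones; i++)
            printf("zone %u: start=%llu len=%llu wp=%llu type=%u\n", i,
                   (unsigned long long)rep->zones[i].start,
                   (unsigned long long)rep->zones[i].len,
                   (unsigned long long)rep->zones[i].wp,
                   (unsigned int)rep->zones[i].type);
    }

    free(rep);
    close(fd);
    return 0;
}

Resetting write pointers works the same way through BLKRESETZONE with a struct blk_zone_range, handled by blkdev_reset_zones_ioctl() further down.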
+ */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_report rep; + struct blk_zone *zones; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) + return -ENOMEM; + + ret = blkdev_report_zones(bdev, rep.sector, + zones, &rep.nr_zones, + GFP_KERNEL); + if (ret) + goto out; + + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { + ret = -EFAULT; + goto out; + } + + if (rep.nr_zones) { + if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, + sizeof(struct blk_zone) * rep.nr_zones)) + ret = -EFAULT; + } + + out: + kfree(zones); + + return ret; +} + +/** + * BLKRESETZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); +} diff --git a/block/blk.h b/block/blk.h index c37492f5edaa..d1ea4bd9b9a3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,10 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +#ifdef CONFIG_DEBUG_FS +extern struct dentry *blk_debugfs_root; +#endif + struct blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; @@ -39,14 +43,9 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops) - return q->fq; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - return hctx->fq; + if (q->mq_ops) + return blk_mq_map_queue(q, ctx->cpu)->fq; + return q->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -101,6 +100,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio); bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); +bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, + struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq); @@ -116,6 +117,7 @@ void blk_account_io_done(struct request *req); enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; /* @@ -135,7 +137,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) +#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); @@ -171,7 +173,7 @@ static inline struct request *__elv_next_request(struct 
request_queue *q) return NULL; } if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) return NULL; } } @@ -180,16 +182,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_activate_req_fn) - e->type->ops.elevator_activate_req_fn(q, rq); + if (e->type->ops.sq.elevator_activate_req_fn) + e->type->ops.sq.elevator_activate_req_fn(q, rq); } static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_deactivate_req_fn) - e->type->ops.elevator_deactivate_req_fn(q, rq); + if (e->type->ops.sq.elevator_deactivate_req_fn) + e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -208,14 +210,14 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); -int attempt_back_merge(struct request_queue *q, struct request *rq); -int attempt_front_merge(struct request_queue *q, struct request *rq); +struct request *attempt_back_merge(struct request_queue *q, struct request *rq); +struct request *attempt_front_merge(struct request_queue *q, struct request *rq); int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); -int blk_try_merge(struct request *rq, struct bio *bio); +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); @@ -252,8 +254,15 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && - (rq->cmd_flags & REQ_IO_STAT) && - (rq->cmd_type == REQ_TYPE_FS); + (rq->rq_flags & RQF_IO_STAT) && + !blk_rq_is_passthrough(rq); +} + +static inline void req_set_nomerge(struct request_queue *q, struct request *req) +{ + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; } /* @@ -268,6 +277,22 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. 
+ */ +static inline struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask * @node: allocation node diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915b..cd15f9dbb147 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -32,8 +32,13 @@ * bsg_destroy_job - routine to teardown/delete a bsg job * @job: bsg_job that is to be torn down */ -static void bsg_destroy_job(struct bsg_job *job) +static void bsg_destroy_job(struct kref *kref) { + struct bsg_job *job = container_of(kref, struct bsg_job, kref); + struct request *rq = job->req; + + blk_end_request_all(rq, rq->errors); + put_device(job->dev); /* release reference for the request */ kfree(job->request_payload.sg_list); @@ -41,6 +46,18 @@ static void bsg_destroy_job(struct bsg_job *job) kfree(job); } +void bsg_job_put(struct bsg_job *job) +{ + kref_put(&job->kref, bsg_destroy_job); +} +EXPORT_SYMBOL_GPL(bsg_job_put); + +int bsg_job_get(struct bsg_job *job) +{ + return kref_get_unless_zero(&job->kref); +} +EXPORT_SYMBOL_GPL(bsg_job_get); + /** * bsg_job_done - completion routine for bsg requests * @job: bsg_job that is complete @@ -54,22 +71,24 @@ void bsg_job_done(struct bsg_job *job, int result, { struct request *req = job->req; struct request *rsp = req->next_rq; + struct scsi_request *rq = scsi_req(req); int err; err = job->req->errors = result; if (err < 0) /* we're only returning the result field in the reply */ - job->req->sense_len = sizeof(u32); + rq->sense_len = sizeof(u32); else - job->req->sense_len = job->reply_len; + rq->sense_len = job->reply_len; /* we assume all request payload was transferred, residual == 0 */ - req->resid_len = 0; + rq->resid_len = 0; if (rsp) { - WARN_ON(reply_payload_rcv_len > rsp->resid_len); + WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len); /* set reply (bidi) residual */ - rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + scsi_req(rsp)->resid_len -= + min(reply_payload_rcv_len, scsi_req(rsp)->resid_len); } blk_complete_request(req); } @@ -83,8 +102,7 @@ static void bsg_softirq_done(struct request *rq) { struct bsg_job *job = rq->special; - blk_end_request_all(rq, rq->errors); - bsg_destroy_job(job); + bsg_job_put(job); } static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) @@ -97,6 +115,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); + scsi_req(req)->resid_len = blk_rq_bytes(req); buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; @@ -111,6 +130,7 @@ static int bsg_create_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; struct request_queue *q = req->q; + struct scsi_request *rq = scsi_req(req); struct bsg_job *job; int ret; @@ -124,9 +144,9 @@ static int bsg_create_job(struct device *dev, struct request *req) job->req = req; if (q->bsg_job_size) job->dd_data = (void *)&job[1]; - job->request = req->cmd; - job->request_len = req->cmd_len; - job->reply = req->sense; + job->request = rq->cmd; + job->request_len = rq->cmd_len; + job->reply = rq->sense; job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer * allocated */ if (req->bio) { @@ -142,6 +162,7 @@ static int bsg_create_job(struct device *dev, struct request *req) job->dev = dev; /* take a 
reference for the request */ get_device(job->dev); + kref_init(&job->kref); return 0; failjob_rls_rqst_payload: @@ -160,7 +181,9 @@ failjob_rls_job: * * Drivers/subsys should pass this to the queue init function. */ -void bsg_request_fn(struct request_queue *q) +static void bsg_request_fn(struct request_queue *q) + __releases(q->queue_lock) + __acquires(q->queue_lock) { struct device *dev = q->queuedata; struct request *req; @@ -195,24 +218,30 @@ void bsg_request_fn(struct request_queue *q) put_device(dev); spin_lock_irq(q->queue_lock); } -EXPORT_SYMBOL_GPL(bsg_request_fn); /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to - * @q: request queue setup by caller * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job - * - * The caller should have setup the reuqest queue with bsg_request_fn - * as the request_fn. */ -int bsg_setup_queue(struct device *dev, struct request_queue *q, - char *name, bsg_job_fn *job_fn, int dd_job_size) +struct request_queue *bsg_setup_queue(struct device *dev, char *name, + bsg_job_fn *job_fn, int dd_job_size) { + struct request_queue *q; int ret; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return ERR_PTR(-ENOMEM); + q->cmd_size = sizeof(struct scsi_request); + q->request_fn = bsg_request_fn; + + ret = blk_init_allocated_queue(q); + if (ret) + goto out_cleanup_queue; + q->queuedata = dev; q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; @@ -224,9 +253,12 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q, if (ret) { printk(KERN_ERR "%s: bsg interface failed to " "initialize - register queue\n", dev->kobj.name); - return ret; + goto out_cleanup_queue; } - return 0; + return q; +out_cleanup_queue: + blk_cleanup_queue(q); + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c index d214e929ce18..a9a8b8e0446f 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -85,7 +85,6 @@ struct bsg_command { struct bio *bidi_bio; int err; struct sg_io_v4 hdr; - char sense[SCSI_SENSE_BUFFERSIZE]; }; static void bsg_free_command(struct bsg_command *bc) @@ -140,18 +139,20 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) { + struct scsi_request *req = scsi_req(rq); + if (hdr->request_len > BLK_MAX_CDB) { - rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); - if (!rq->cmd) + req->cmd = kzalloc(hdr->request_len, GFP_KERNEL); + if (!req->cmd) return -ENOMEM; } - if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request, + if (copy_from_user(req->cmd, (void __user *)(unsigned long)hdr->request, hdr->request_len)) return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, has_write_perm)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -159,7 +160,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, /* * fill in request structure */ - rq->cmd_len = hdr->request_len; + req->cmd_len = hdr->request_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -176,7 +177,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * Check if sg_io_v4 from user is allowed and valid */ static int -bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) +bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int 
*op) { int ret = 0; @@ -197,7 +198,7 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) ret = -EINVAL; } - *rw = hdr->dout_xfer_len ? WRITE : READ; + *op = hdr->dout_xfer_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN; return ret; } @@ -205,13 +206,12 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, - u8 *sense) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; - int ret, rw; - unsigned int dxfer_len; + int ret; + unsigned int op, dxfer_len; void __user *dxferp = NULL; struct bsg_class_device *bcd = &q->bsg_dev; @@ -226,36 +226,35 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); - ret = bsg_validate_sgv4_hdr(q, hdr, &rw); + ret = bsg_validate_sgv4_hdr(hdr, &op); if (ret) return ERR_PTR(ret); /* * map scatter-gather elements separately and string them to request */ - rq = blk_get_request(q, rw, GFP_KERNEL); + rq = blk_get_request(q, op, GFP_KERNEL); if (IS_ERR(rq)) return rq; - blk_rq_set_block_pc(rq); + scsi_req_init(rq); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; - if (rw == WRITE && hdr->din_xfer_len) { + if (op == REQ_OP_SCSI_OUT && hdr->din_xfer_len) { if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) { ret = -EOPNOTSUPP; goto out; } - next_rq = blk_get_request(q, READ, GFP_KERNEL); + next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); if (IS_ERR(next_rq)) { ret = PTR_ERR(next_rq); next_rq = NULL; goto out; } rq->next_rq = next_rq; - next_rq->cmd_type = rq->cmd_type; dxferp = (void __user *)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, NULL, dxferp, @@ -280,13 +279,9 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, goto out; } - rq->sense = sense; - rq->sense_len = 0; - return rq; out: - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(scsi_req(rq)); blk_put_request(rq); if (next_rq) { blk_rq_unmap_user(next_rq->bio); @@ -393,6 +388,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct bio *bio, struct bio *bidi_bio) { + struct scsi_request *req = scsi_req(rq); int ret = 0; dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); @@ -407,12 +403,12 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, hdr->info |= SG_INFO_CHECK; hdr->response_len = 0; - if (rq->sense_len && hdr->response) { + if (req->sense_len && hdr->response) { int len = min_t(unsigned int, hdr->max_response_len, - rq->sense_len); + req->sense_len); ret = copy_to_user((void __user *)(unsigned long)hdr->response, - rq->sense, len); + req->sense, len); if (!ret) hdr->response_len = len; else @@ -420,14 +416,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, } if (rq->next_rq) { - hdr->dout_resid = rq->resid_len; - hdr->din_resid = rq->next_rq->resid_len; + hdr->dout_resid = req->resid_len; + hdr->din_resid = scsi_req(rq->next_rq)->resid_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) - hdr->din_resid = rq->resid_len; + hdr->din_resid = req->resid_len; else - hdr->dout_resid = rq->resid_len; + 
hdr->dout_resid = req->resid_len; /* * If the request generated a negative error number, return it @@ -439,8 +435,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, ret = rq->errors; blk_rq_unmap_user(bio); - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(req); blk_put_request(rq); return ret; @@ -625,7 +620,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense); + rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,6 +650,9 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EINVAL; + bsg_set_block(bd, file); bytes_written = 0; @@ -908,12 +906,11 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct bio *bio, *bidi_bio = NULL; struct sg_io_v4 hdr; int at_head; - u8 sense[SCSI_SENSE_BUFFERSIZE]; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense); + rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd4303..137944777859 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -16,6 +16,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-cgroup.h> #include "blk.h" +#include "blk-wbt.h" /* * tunables @@ -667,10 +668,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +685,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +786,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, 
args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -913,15 +913,6 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) } /* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). - */ -static inline bool cfq_bio_sync(struct bio *bio) -{ - return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); -} - -/* * scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing */ @@ -1596,7 +1587,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { struct cfq_group_data *cgd; - cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + cgd = kzalloc(sizeof(*cgd), gfp); if (!cgd) return NULL; return &cgd->cpd; @@ -2474,10 +2465,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2491,7 +2482,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) if (!cic) return NULL; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); if (cfqq) return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); @@ -2530,14 +2521,14 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; } } -static int cfq_merge(struct request_queue *q, struct request **req, +static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; @@ -2553,7 +2544,7 @@ static int cfq_merge(struct request_queue *q, struct request **req, } static void cfq_merged_request(struct request_queue *q, struct request *req, - int type) + enum elv_merge type) { if (type == ELEVATOR_FRONT_MERGE) { struct cfq_queue *cfqq = RQ_CFQQ(req); @@ -2565,7 +2556,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), 
bio->bi_opf); } static void @@ -2588,7 +2579,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -2605,13 +2596,14 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); struct cfq_io_cq *cic; struct cfq_queue *cfqq; /* * Disallow merge of a sync bio into an async request. */ - if (cfq_bio_sync(bio) && !rq_is_sync(rq)) + if (is_sync && !rq_is_sync(rq)) return false; /* @@ -2622,7 +2614,7 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, if (!cic) return false; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, is_sync); return cfqq == RQ_CFQQ(rq); } @@ -2757,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqg) return NULL; - for_each_cfqg_st(cfqg, i, j, st) - if ((cfqq = cfq_rb_first(st)) != NULL) + for_each_cfqg_st(cfqg, i, j, st) { + cfqq = cfq_rb_first(st); + if (cfqq) return cfqq; + } return NULL; } @@ -3042,7 +3036,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3413,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3507,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3759,14 +3760,16 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; + bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; + nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3774,7 +3777,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) * spuriously on a newly created cic but there's no harm. */ if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return; + return nonroot_cg; /* * Drop reference to queues. 
New queues will be assigned in new @@ -3795,9 +3798,13 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) } cic->blkcg_serial_nr = serial_nr; + return nonroot_cg; } #else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } +static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +{ + return false; +} #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue ** @@ -3847,13 +3854,16 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, cfqd->queue->node); if (!cfqq) { cfqq = &cfqd->oom_cfqq; goto out; } + /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ + cfqq->ioprio_class = IOPRIO_CLASS_NONE; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); @@ -3916,6 +3926,12 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } +static inline bool req_noidle(struct request *req) +{ + return req_op(req) == REQ_OP_WRITE && + (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; +} + /* * Disable idle window if the process thinks too long or seeks so much that * it doesn't matter @@ -3937,7 +3953,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + if (cfqq->next_rq && req_noidle(cfqq->next_rq)) enable_idle = 0; else if (!atomic_read(&cic->icq.ioc->active_ref) || !cfqd->cfq_slice_idle || @@ -3989,7 +4005,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. 
*/ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* @@ -4135,7 +4151,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4222,8 +4238,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) const int sync = rq_is_sync(rq); u64 now = ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", - !!(rq->cmd_flags & REQ_NOIDLE)); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); cfq_update_hw_tag(cfqd); @@ -4233,8 +4248,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4312,14 +4326,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4340,7 +4354,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4357,10 +4371,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } @@ -4434,11 +4448,12 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; + bool disable_wbt; spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); - check_blkcg_changed(cic, bio); + disable_wbt = check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { @@ -4474,6 +4489,10 @@ new_queue: rq->elv.priv[0] = cfqq; rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); + + if (disable_wbt) + wbt_disable_default(q); + return 0; } @@ -4823,7 +4842,7 @@ static struct elv_fs_entry cfq_attrs[] = { }; static struct elevator_type iosched_cfq = { - .ops = { + .ops.sq = { .elevator_merge_fn = cfq_merge, .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 556826ac7cb4..570021a0dc1c 100644 --- a/block/compat_ioctl.c +++ 
b/block/compat_ioctl.c @@ -661,7 +661,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = inode->i_bdev; struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - struct backing_dev_info *bdi; loff_t size; unsigned int max_sectors; @@ -708,9 +707,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); return compat_put_long(arg, - (bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKROGET: /* compatible */ return compat_put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -728,8 +726,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 55e0bb6d7da7..c68f6bbc0dcd 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -120,12 +120,11 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq) deadline_del_rq_rb(dd, rq); } -static int +static enum elv_merge deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; - int ret; /* * check for front merge @@ -138,20 +137,17 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; + *req = __rq; + return ELEVATOR_FRONT_MERGE; } } } return ELEVATOR_NO_MERGE; -out: - *req = __rq; - return ret; } static void deadline_merged_request(struct request_queue *q, - struct request *req, int type) + struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; @@ -439,7 +435,7 @@ static struct elv_fs_entry deadline_attrs[] = { }; static struct elevator_type iosched_deadline = { - .ops = { + .ops.sq = { .elevator_merge_fn = deadline_merge, .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, diff --git a/block/elevator.c b/block/elevator.c index f7d973a56fd7..699d10f71a2c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -40,6 +40,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -58,8 +59,10 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_bio_merge_fn) - return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->uses_mq && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) + return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); return 1; } @@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->uses_mq = e->uses_mq; return eq; } @@ -203,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name) } /* 
- * Use the default elevator specified by config boot param or - * config option. Don't try to load modules as we could be running - * off async and request_module() isn't allowed from async. + * Use the default elevator specified by config boot param for + * non-mq devices, or by config option. Don't try to load modules + * as we could be running off async and request_module() isn't + * allowed from async. */ - if (!e && *chosen_elevator) { + if (!e && !q->mq_ops && *chosen_elevator) { e = elevator_get(chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", @@ -215,18 +220,32 @@ int elevator_init(struct request_queue *q, char *name) } if (!e) { - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (q->mq_ops && q->nr_hw_queues == 1) + e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false); + else if (q->mq_ops) + e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false); + else + e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ - "Using noop.\n"); + "Using noop/none.\n"); e = elevator_get("noop", false); } } - err = e->ops.elevator_init_fn(q, e); - if (err) + if (e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = e->ops.mq.init_sched(q, e); + } else + err = e->ops.sq.elevator_init_fn(q, e); + if (err) { + if (e->uses_mq) + blk_mq_sched_teardown(q); elevator_put(e); + } return err; } EXPORT_SYMBOL(elevator_init); @@ -234,8 +253,10 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->type->ops.elevator_exit_fn) - e->type->ops.elevator_exit_fn(e); + if (e->uses_mq && e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) + e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -245,31 +266,33 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); - rq->cmd_flags &= ~REQ_HASHED; + rq->rq_flags &= ~RQF_HASHED; } -static void elv_rqhash_del(struct request_queue *q, struct request *rq) +void elv_rqhash_del(struct request_queue *q, struct request *rq) { if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } +EXPORT_SYMBOL_GPL(elv_rqhash_del); -static void elv_rqhash_add(struct request_queue *q, struct request *rq) +void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); - rq->cmd_flags |= REQ_HASHED; + rq->rq_flags |= RQF_HASHED; } +EXPORT_SYMBOL_GPL(elv_rqhash_add); -static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) +void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { __elv_rqhash_del(rq); elv_rqhash_add(q, rq); } -static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; struct hlist_node *next; @@ -352,7 +375,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) { sector_t boundary; struct list_head *entry; - int stop_flags; if (q->last_merge == rq) q->last_merge = NULL; @@ -362,7 +384,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -370,7 +391,7 @@ void 
elv_dispatch_sort(struct request_queue *q, struct request *rq) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; - if (pos->cmd_flags & stop_flags) + if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) break; if (blk_rq_pos(rq) >= boundary) { if (blk_rq_pos(pos) < boundary) @@ -407,11 +428,11 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(elv_dispatch_add_tail); -int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) +enum elv_merge elv_merge(struct request_queue *q, struct request **req, + struct bio *bio) { struct elevator_queue *e = q->elevator; struct request *__rq; - int ret; /* * Levels of merges: @@ -426,7 +447,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * First try one-hit cache. */ if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { - ret = blk_try_merge(q->last_merge, bio); + enum elv_merge ret = blk_try_merge(q->last_merge, bio); + if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; @@ -445,8 +467,10 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_BACK_MERGE; } - if (e->type->ops.elevator_merge_fn) - return e->type->ops.elevator_merge_fn(q, req, bio); + if (e->uses_mq && e->type->ops.mq.request_merge) + return e->type->ops.mq.request_merge(q, req, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) + return e->type->ops.sq.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -458,8 +482,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * * Returns true if we merged, false otherwise */ -static bool elv_attempt_insert_merge(struct request_queue *q, - struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) { struct request *__rq; bool ret; @@ -493,12 +516,15 @@ static bool elv_attempt_insert_merge(struct request_queue *q, return ret; } -void elv_merged_request(struct request_queue *q, struct request *rq, int type) +void elv_merged_request(struct request_queue *q, struct request *rq, + enum elv_merge type) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_merged_fn) - e->type->ops.elevator_merged_fn(q, rq, type); + if (e->uses_mq && e->type->ops.mq.request_merged) + e->type->ops.mq.request_merged(q, rq, type); + else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) + e->type->ops.sq.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -510,10 +536,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - const int next_sorted = next->cmd_flags & REQ_SORTED; - - if (next_sorted && e->type->ops.elevator_merge_req_fn) - e->type->ops.elevator_merge_req_fn(q, rq, next); + bool next_sorted = false; + + if (e->uses_mq && e->type->ops.mq.requests_merged) + e->type->ops.mq.requests_merged(q, rq, next); + else if (e->type->ops.sq.elevator_merge_req_fn) { + next_sorted = (__force bool)(next->rq_flags & RQF_SORTED); + if (next_sorted) + e->type->ops.sq.elevator_merge_req_fn(q, rq, next); + } elv_rqhash_reposition(q, rq); @@ -530,20 +561,23 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_bio_merged_fn) - e->type->ops.elevator_bio_merged_fn(q, rq, bio); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_bio_merged_fn) + 
e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } #ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; } static void blk_pm_add_request(struct request_queue *q, struct request *rq) { - if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 && + if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 && (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) pm_request_resume(q->dev); } @@ -563,11 +597,11 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_deactivate_rq(q, rq); } - rq->cmd_flags &= ~REQ_STARTED; + rq->rq_flags &= ~RQF_STARTED; blk_pm_requeue_request(rq); @@ -576,11 +610,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { + struct elevator_queue *e = q->elevator; static int printed; + if (WARN_ON_ONCE(e->uses_mq)) + return; + lockdep_assert_held(q->queue_lock); - while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) + while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " @@ -597,13 +635,13 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) rq->q = q; - if (rq->cmd_flags & REQ_SOFTBARRIER) { + if (rq->rq_flags & RQF_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS) { + if (!blk_rq_is_passthrough(rq)) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + } else if (!(rq->rq_flags & RQF_ELVPRIV) && (where == ELEVATOR_INSERT_SORT || where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; @@ -611,12 +649,12 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_BACK: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; elv_drain_elevator(q); list_add_tail(&rq->queuelist, &q->queue_head); /* @@ -641,8 +679,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (elv_attempt_insert_merge(q, rq)) break; case ELEVATOR_INSERT_SORT: - BUG_ON(rq->cmd_type != REQ_TYPE_FS); - rq->cmd_flags |= REQ_SORTED; + BUG_ON(blk_rq_is_passthrough(rq)); + rq->rq_flags |= RQF_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -655,11 +693,11 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) * rq cannot be accessed after calling * elevator_add_req_fn. 
*/ - q->elevator->type->ops.elevator_add_req_fn(q, rq); + q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); break; case ELEVATOR_INSERT_FLUSH: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; blk_insert_flush(rq); break; default: @@ -684,8 +722,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_latter_req_fn) - return e->type->ops.elevator_latter_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.next_request) + return e->type->ops.mq.next_request(q, rq); + else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) + return e->type->ops.sq.elevator_latter_req_fn(q, rq); + return NULL; } @@ -693,8 +734,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_former_req_fn) - return e->type->ops.elevator_former_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.former_request) + return e->type->ops.mq.former_request(q, rq); + if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) + return e->type->ops.sq.elevator_former_req_fn(q, rq); return NULL; } @@ -703,8 +746,11 @@ int elv_set_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_set_req_fn) + return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -712,16 +758,22 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_put_req_fn) - e->type->ops.elevator_put_req_fn(rq); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_put_req_fn) + e->type->ops.sq.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_may_queue_fn) + return e->type->ops.sq.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } @@ -730,14 +782,17 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + /* * request is released from the driver, io must be done */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if ((rq->cmd_flags & REQ_SORTED) && - e->type->ops.elevator_completed_req_fn) - e->type->ops.elevator_completed_req_fn(q, rq); + if ((rq->rq_flags & RQF_SORTED) && + e->type->ops.sq.elevator_completed_req_fn) + e->type->ops.sq.elevator_completed_req_fn(q, rq); } } @@ -805,8 +860,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (e->type->ops.elevator_registered_fn) - e->type->ops.elevator_registered_fn(q); + if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) + e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -893,9 +948,14 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; - bool registered = old->registered; + bool old_registered = false; int err; + if 
(q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } + /* * Turn on BYPASS and drain all requests w/ elevator private data. * Block layer doesn't call into a quiesced elevator - all requests @@ -903,42 +963,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * using INSERT_BACK. All requests have SOFTBARRIER set and no * merge happens either. */ - blk_queue_bypass_start(q); + if (old) { + old_registered = old->registered; + + if (old->uses_mq) + blk_mq_sched_teardown(q); + + if (!q->mq_ops) + blk_queue_bypass_start(q); - /* unregister and clear all auxiliary data of the old elevator */ - if (registered) - elv_unregister_queue(q); + /* unregister and clear all auxiliary data of the old elevator */ + if (old_registered) + elv_unregister_queue(q); - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + } /* allocate, init and register new elevator */ - err = new_e->ops.elevator_init_fn(q, new_e); - if (err) - goto fail_init; + if (new_e) { + if (new_e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = new_e->ops.mq.init_sched(q, new_e); + } else + err = new_e->ops.sq.elevator_init_fn(q, new_e); + if (err) + goto fail_init; - if (registered) { err = elv_register_queue(q); if (err) goto fail_register; - } + } else + q->elevator = NULL; /* done, kill the old one and finish */ - elevator_exit(old); - blk_queue_bypass_end(q); + if (old) { + elevator_exit(old); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); return 0; fail_register: + if (q->mq_ops) + blk_mq_sched_teardown(q); elevator_exit(q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); + if (old) { + q->elevator = old; + elv_register_queue(q); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } return err; } @@ -951,8 +1045,11 @@ static int __elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; - if (!q->elevator) - return -ENXIO; + /* + * Special case for mq, turn off scheduling + */ + if (q->mq_ops && !strncmp(name, "none", 4)) + return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name), true); @@ -961,11 +1058,21 @@ static int __elevator_change(struct request_queue *q, const char *name) return -EINVAL; } - if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && + !strcmp(elevator_name, q->elevator->type->elevator_name)) { elevator_put(e); return 0; } + if (!e->uses_mq && q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + if (e->uses_mq && !q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + return elevator_switch(q, e); } @@ -987,7 +1094,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!q->elevator) + if (!(q->mq_ops || q->request_fn)) return count; ret = __elevator_change(q, name); @@ -1001,24 +1108,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char 
*name, ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *e = q->elevator; - struct elevator_type *elv; + struct elevator_type *elv = NULL; struct elevator_type *__e; int len = 0; - if (!q->elevator || !blk_queue_stackable(q)) + if (!blk_queue_stackable(q)) return sprintf(name, "none\n"); - elv = e->type; + if (!q->elevator) + len += sprintf(name+len, "[none] "); + else + elv = e->type; spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (!strcmp(elv->elevator_name, __e->elevator_name)) + if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); - else + continue; + } + if (__e->uses_mq && q->mq_ops) + len += sprintf(name+len, "%s ", __e->elevator_name); + else if (!__e->uses_mq && !q->mq_ops) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); + if (q->mq_ops && q->elevator) + len += sprintf(name+len, "none"); + len += sprintf(len+name, "\n"); return len; } diff --git a/block/genhd.c b/block/genhd.c index fcd6d4fae657..3631cd480295 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -572,6 +572,20 @@ exit: disk_part_iter_exit(&piter); } +void put_disk_devt(struct disk_devt *disk_devt) +{ + if (disk_devt && atomic_dec_and_test(&disk_devt->count)) + disk_devt->release(disk_devt); +} +EXPORT_SYMBOL(put_disk_devt); + +void get_disk_devt(struct disk_devt *disk_devt) +{ + if (disk_devt) + atomic_inc(&disk_devt->count); +} +EXPORT_SYMBOL(get_disk_devt); + /** * device_add_disk - add partitioning information to kernel list * @parent: parent device for the disk @@ -612,8 +626,15 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_alloc_events(disk); + /* + * Take a reference on the devt and assign it to queue since it + * must not be reallocated while the bdi is registered + */ + disk->queue->disk_devt = disk->disk_devt; + get_disk_devt(disk->disk_devt); + /* Register BDI before referencing it from bdev */ - bdi = &disk->queue->backing_dev_info; + bdi = disk->queue->backing_dev_info; bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, @@ -648,6 +669,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { + bdev_unhash_inode(MKDEV(disk->major, + disk->first_minor + part->partno)); invalidate_partition(disk, part->partno); delete_partition(disk, part->partno); } diff --git a/block/ioctl.c b/block/ioctl.c index ed2397f8de9d..7b88820b93d9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -8,7 +8,7 @@ #include <linux/fs.h> #include <linux/blktrace_api.h> #include <linux/pr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { @@ -45,6 +45,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user || pstart < 0 || plength < 0 || partno > 65535) return -EINVAL; } + /* check if partition is aligned to blocksize */ + if (p.start & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; mutex_lock(&bdev->bd_mutex); @@ -225,7 +228,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + struct address_space *mapping; + uint64_t start, end, len; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -235,18 +239,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, start = 
range[0]; len = range[1]; + end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) return -EINVAL; + if (end < start) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); + return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + false); } static int put_ushort(unsigned long arg, unsigned short val) @@ -496,7 +505,6 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - struct backing_dev_info *bdi; void __user *argp = (void __user *)arg; loff_t size; unsigned int max_sectors; @@ -513,14 +521,17 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); - return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512); + return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -547,8 +558,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); diff --git a/block/mq-deadline.c b/block/mq-deadline.c new file mode 100644 index 000000000000..236121633ca0 --- /dev/null +++ b/block/mq-deadline.c @@ -0,0 +1,556 @@ +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/elevator.h> +#include <linux/bio.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/rbtree.h> +#include <linux/sbitmap.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * See Documentation/block/deadline-iosched.txt + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +struct deadline_data { + /* + * run time data + */ + + /* + * requests (deadline_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + /* + * next in sort order. 
read, write or both are NULL
+ */
+ struct request *next_rq[2];
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[2];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+
+ spinlock_t lock;
+ struct list_head dispatch;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+ return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(dd, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ if (dd->next_rq[data_dir] == rq)
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ list_del_init(&rq->queuelist);
+
+ /*
+ * We might not be on the rbtree, if we are doing an insert merge
+ */
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ deadline_del_rq_rb(dd, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+ enum elv_merge type)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ /*
+ * if the merge was a front merge, we need to reposition request
+ */
+ if (type == ELEVATOR_FRONT_MERGE) {
+ elv_rb_del(deadline_rb_root(dd, req), req);
+ deadline_add_rq_rb(dd, req);
+ }
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ dd->next_rq[READ] = NULL;
+ dd->next_rq[WRITE] = NULL;
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+ struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+ bool reads, writes;
+ int data_dir;
+
+ if (!list_empty(&dd->dispatch)) {
+ rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ reads = !list_empty(&dd->fifo_list[READ]);
+ writes = !list_empty(&dd->fifo_list[WRITE]);
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ if (dd->next_rq[WRITE])
+ rq = dd->next_rq[WRITE];
+ else
+ rq = dd->next_rq[READ];
+
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (reads) {
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+ if (writes && (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (writes) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = dd->next_rq[data_dir];
+ }
+
+ dd->batching = 0;
+
+dispatch_request:
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, rq);
+done:
+ rq->rq_flags |= RQF_STARTED;
+ return rq;
+}
+
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+
+ spin_lock(&dd->lock);
+ rq = __dd_dispatch_request(hctx);
+ spin_unlock(&dd->lock);
+
+ return rq;
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+ struct deadline_data *dd = e->elevator_data;
+
+ BUG_ON(!list_empty(&dd->fifo_list[READ]));
+ BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+ kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct deadline_data *dd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = dd;
+
+ INIT_LIST_HEAD(&dd->fifo_list[READ]);
+ INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+ dd->sort_list[READ] = RB_ROOT;
+ dd->sort_list[WRITE] = RB_ROOT;
+ dd->fifo_expire[READ] = read_expire;
+ dd->fifo_expire[WRITE] = write_expire;
+ dd->writes_starved = writes_starved;
+ dd->front_merges = 1;
+ dd->fifo_batch = fifo_batch;
+ spin_lock_init(&dd->lock);
+ INIT_LIST_HEAD(&dd->dispatch);
+
+ q->elevator = eq;
+ return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+ struct bio *bio)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ sector_t sector = bio_end_sector(bio);
+ struct request *__rq;
+
+ if (!dd->front_merges)
+ return ELEVATOR_NO_MERGE;
+
+ __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+ if (__rq) {
+ BUG_ON(sector != blk_rq_pos(__rq));
+
+ if (elv_bio_merge_ok(__rq, bio)) {
+ *rq = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ bool ret;
+
+ spin_lock(&dd->lock);
+ ret = blk_mq_sched_try_merge(q, bio, &free);
+ spin_unlock(&dd->lock);
+
+ if (free)
+ blk_mq_free_request(free);
+
+ return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const int data_dir = rq_data_dir(rq);
+
+ if (blk_mq_sched_try_insert_merge(q, rq))
+ return;
+
+ blk_mq_sched_request_inserted(rq);
+
+ if (at_head || blk_rq_is_passthrough(rq)) {
+ if (at_head)
+ list_add(&rq->queuelist, &dd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &dd->dispatch);
+ } else {
+ deadline_add_rq_rb(dd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+ }
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ spin_lock(&dd->lock);
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ dd_insert_request(hctx, rq, at_head);
+ }
+ spin_unlock(&dd->lock);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+ return !list_empty_careful(&dd->dispatch) ||
+ !list_empty_careful(&dd->fifo_list[0]) ||
+ !list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+ return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+ char *p = (char *) page;
+
+ *var = simple_strtol(p, &p, 10);
+ return count;
+}
+
+#define 
SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data; \ + int ret = deadline_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +#define DD_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ + deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +static struct elevator_type mq_deadline = { + .ops.mq = { + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_queue, + .exit_sched = dd_exit_queue, + }, + + .uses_mq = true, + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_owner = THIS_MODULE, +}; + +static int __init deadline_init(void) +{ + return elv_register(&mq_deadline); +} + +static void __exit deadline_exit(void) +{ + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index a163c487cf38..2d1b15d89b45 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e) } static struct elevator_type elevator_noop = { - .ops = { + .ops.sq = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, diff --git a/block/opal_proto.h b/block/opal_proto.h new file mode 100644 index 000000000000..f40c9acf8895 --- /dev/null +++ b/block/opal_proto.h @@ -0,0 +1,452 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli <rafael.antognolli@intel.com> + * Scott Bauer <scott.bauer@intel.com> + * + * This program is free 
software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/types.h> + +#ifndef _OPAL_PROTO_H +#define _OPAL_PROTO_H + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +#define DTAERROR_NO_METHOD_STATUS 0x89 +#define GENERIC_HOST_SESSION_NUM 0x41 + +#define TPER_SYNC_SUPPORTED 0x01 + +#define TINY_ATOM_DATA_MASK 0x3F +#define TINY_ATOM_SIGNED 0x40 + +#define SHORT_ATOM_ID 0x80 +#define SHORT_ATOM_BYTESTRING 0x20 +#define SHORT_ATOM_SIGNED 0x10 +#define SHORT_ATOM_LEN_MASK 0xF + +#define MEDIUM_ATOM_ID 0xC0 +#define MEDIUM_ATOM_BYTESTRING 0x10 +#define MEDIUM_ATOM_SIGNED 0x8 +#define MEDIUM_ATOM_LEN_MASK 0x7 + +#define LONG_ATOM_ID 0xe0 +#define LONG_ATOM_BYTESTRING 0x2 +#define LONG_ATOM_SIGNED 0x1 + +/* Derived from TCG Core spec 2.01 Section: + * 3.2.2.1 + * Data Type + */ +#define TINY_ATOM_BYTE 0x7F +#define SHORT_ATOM_BYTE 0xBF +#define MEDIUM_ATOM_BYTE 0xDF +#define LONG_ATOM_BYTE 0xE3 + +#define OPAL_INVAL_PARAM 12 +#define OPAL_MANUFACTURED_INACTIVE 0x08 +#define OPAL_DISCOVERY_COMID 0x0001 + +#define LOCKING_RANGE_NON_GLOBAL 0x03 +/* + * User IDs used in the TCG storage SSCs + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +#define OPAL_UID_LENGTH 8 +#define OPAL_METHOD_LENGTH 8 +#define OPAL_MSID_KEYLEN 15 +#define OPAL_UID_LENGTH_HALF 4 + +/* Enum to index OPALUID array */ +enum opal_uid { + /* users */ + OPAL_SMUID_UID, + OPAL_THISSP_UID, + OPAL_ADMINSP_UID, + OPAL_LOCKINGSP_UID, + OPAL_ENTERPRISE_LOCKINGSP_UID, + OPAL_ANYBODY_UID, + OPAL_SID_UID, + OPAL_ADMIN1_UID, + OPAL_USER1_UID, + OPAL_USER2_UID, + OPAL_PSID_UID, + OPAL_ENTERPRISE_BANDMASTER0_UID, + OPAL_ENTERPRISE_ERASEMASTER_UID, + /* tables */ + OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + OPAL_LOCKINGRANGE_ACE_WRLOCKED, + OPAL_MBRCONTROL, + OPAL_MBR, + OPAL_AUTHORITY_TABLE, + OPAL_C_PIN_TABLE, + OPAL_LOCKING_INFO_TABLE, + OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + /* C_PIN_TABLE object ID's */ + OPAL_C_PIN_MSID, + OPAL_C_PIN_SID, + OPAL_C_PIN_ADMIN1, + /* half UID's (only first 4 bytes used) */ + OPAL_HALF_UID_AUTHORITY_OBJ_REF, + OPAL_HALF_UID_BOOLEAN_ACE, + /* omitted optional parameter */ + OPAL_UID_HEXFF, +}; + +#define OPAL_METHOD_LENGTH 8 + +/* Enum for indexing the OPALMETHOD array */ +enum opal_method { + OPAL_PROPERTIES, + OPAL_STARTSESSION, + OPAL_REVERT, + OPAL_ACTIVATE, + OPAL_EGET, + OPAL_ESET, + OPAL_NEXT, + OPAL_EAUTHENTICATE, + OPAL_GETACL, + OPAL_GENKEY, + OPAL_REVERTSP, + OPAL_GET, + OPAL_SET, + OPAL_AUTHENTICATE, + OPAL_RANDOM, + OPAL_ERASE, +}; + +enum opal_token { + /* Boolean */ + OPAL_TRUE = 0x01, + OPAL_FALSE = 0x00, + OPAL_BOOLEAN_EXPR = 0x03, + /* cellblocks */ + OPAL_TABLE = 0x00, + 
OPAL_STARTROW = 0x01, + OPAL_ENDROW = 0x02, + OPAL_STARTCOLUMN = 0x03, + OPAL_ENDCOLUMN = 0x04, + OPAL_VALUES = 0x01, + /* authority table */ + OPAL_PIN = 0x03, + /* locking tokens */ + OPAL_RANGESTART = 0x03, + OPAL_RANGELENGTH = 0x04, + OPAL_READLOCKENABLED = 0x05, + OPAL_WRITELOCKENABLED = 0x06, + OPAL_READLOCKED = 0x07, + OPAL_WRITELOCKED = 0x08, + OPAL_ACTIVEKEY = 0x0A, + /* locking info table */ + OPAL_MAXRANGES = 0x04, + /* mbr control */ + OPAL_MBRENABLE = 0x01, + OPAL_MBRDONE = 0x02, + /* properties */ + OPAL_HOSTPROPERTIES = 0x00, + /* atoms */ + OPAL_STARTLIST = 0xf0, + OPAL_ENDLIST = 0xf1, + OPAL_STARTNAME = 0xf2, + OPAL_ENDNAME = 0xf3, + OPAL_CALL = 0xf8, + OPAL_ENDOFDATA = 0xf9, + OPAL_ENDOFSESSION = 0xfa, + OPAL_STARTTRANSACTON = 0xfb, + OPAL_ENDTRANSACTON = 0xfC, + OPAL_EMPTYATOM = 0xff, + OPAL_WHERE = 0x00, +}; + +/* Locking state for a locking range */ +enum opal_lockingstate { + OPAL_LOCKING_READWRITE = 0x01, + OPAL_LOCKING_READONLY = 0x02, + OPAL_LOCKING_LOCKED = 0x03, +}; + +/* Packets derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Secion: 3.2.3 ComPackets, Packets & Subpackets + */ + +/* Comm Packet (header) for transmissions. */ +struct opal_compacket { + __be32 reserved0; + u8 extendedComID[4]; + __be32 outstandingData; + __be32 minTransfer; + __be32 length; +}; + +/* Packet structure. */ +struct opal_packet { + __be32 tsn; + __be32 hsn; + __be32 seq_number; + __be16 reserved0; + __be16 ack_type; + __be32 acknowledgment; + __be32 length; +}; + +/* Data sub packet header */ +struct opal_data_subpacket { + u8 reserved0[6]; + __be16 kind; + __be32 length; +}; + +/* header of a response */ +struct opal_header { + struct opal_compacket cp; + struct opal_packet pkt; + struct opal_data_subpacket subpkt; +}; + +#define FC_TPER 0x0001 +#define FC_LOCKING 0x0002 +#define FC_GEOMETRY 0x0003 +#define FC_ENTERPRISE 0x0100 +#define FC_DATASTORE 0x0202 +#define FC_SINGLEUSER 0x0201 +#define FC_OPALV100 0x0200 +#define FC_OPALV200 0x0203 + +/* + * The Discovery 0 Header. As defined in + * Opal SSC Documentation + * Section: 3.3.5 Capability Discovery + */ +struct d0_header { + __be32 length; /* the length of the header 48 in 2.00.100 */ + __be32 revision; /**< revision of the header 1 in 2.00.100 */ + __be32 reserved01; + __be32 reserved02; + /* + * the remainder of the structure is vendor specific and will not be + * addressed now + */ + u8 ignored[32]; +}; + +/* + * TPer Feature Descriptor. Contains flags indicating support for the + * TPer features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x001 in 2.00.100 + */ +struct d0_tper_features { + /* + * supported_features bits: + * bit 7: reserved + * bit 6: com ID management + * bit 5: reserved + * bit 4: streaming support + * bit 3: buffer management + * bit 2: ACK/NACK + * bit 1: async + * bit 0: sync + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Locking Feature Descriptor. Contains flags indicating support for the + * locking features described in the OPAL specification. 
The names match the + * OPAL terminology + * + * code == 0x0002 in 2.00.100 + */ +struct d0_locking_features { + /* + * supported_features bits: + * bits 6-7: reserved + * bit 5: MBR done + * bit 4: MBR enabled + * bit 3: media encryption + * bit 2: locked + * bit 1: locking enabled + * bit 0: locking supported + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Geometry Feature Descriptor. Contains flags indicating support for the + * geometry features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0003 in 2.00.100 + */ +struct d0_geometry_features { + /* + * skip 32 bits from header, needed to align the struct to 64 bits. + */ + u8 header[4]; + /* + * reserved01: + * bits 1-6: reserved + * bit 0: align + */ + u8 reserved01; + u8 reserved02[7]; + __be32 logical_block_size; + __be64 alignment_granularity; + __be64 lowest_aligned_lba; +}; + +/* + * Enterprise SSC Feature + * + * code == 0x0100 + */ +struct d0_enterprise_ssc { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + u8 reserved01; + __be16 reserved02; + __be32 reserved03; + __be32 reserved04; +}; + +/* + * Opal V1 feature + * + * code == 0x0200 + */ +struct d0_opal_v100 { + __be16 baseComID; + __be16 numComIDs; +}; + +/* + * Single User Mode feature + * + * code == 0x0201 + */ +struct d0_single_user_mode { + __be32 num_locking_objects; + /* reserved01: + * bit 0: any + * bit 1: all + * bit 2: policy + * bits 3-7: reserved + */ + u8 reserved01; + u8 reserved02; + __be16 reserved03; + __be32 reserved04; +}; + +/* + * Additonal Datastores feature + * + * code == 0x0202 + */ +struct d0_datastore_table { + __be16 reserved01; + __be16 max_tables; + __be32 max_size_tables; + __be32 table_size_alignment; +}; + +/* + * OPAL 2.0 feature + * + * code == 0x0203 + */ +struct d0_opal_v200 { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + /* num_locking_admin_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_admin_auth[2]; + /* num_locking_user_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_user_auth[2]; + u8 initialPIN; + u8 revertedPIN; + u8 reserved01; + __be32 reserved02; +}; + +/* Union of features used to parse the discovery 0 response */ +struct d0_features { + __be16 code; + /* + * r_version bits: + * bits 4-7: version + * bits 0-3: reserved + */ + u8 r_version; + u8 length; + u8 features[]; +}; + +#endif /* _OPAL_PROTO_H */ diff --git a/block/partition-generic.c b/block/partition-generic.c index 71d9ed9df8da..7afb9907821f 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -430,6 +430,56 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } +static bool part_zone_aligned(struct gendisk *disk, + struct block_device *bdev, + sector_t from, sector_t size) +{ + unsigned int zone_sectors = bdev_zone_sectors(bdev); + + /* + * If this function is called, then the disk is a zoned block device + * (host-aware or host-managed). This can be detected even if the + * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not + * set). 
In this case, however, only host-aware devices will be seen + * as a block device is not created for host-managed devices. Without + * zoned block device support, host-aware drives can still be used as + * regular block devices (no zone operation) and their zone size will + * be reported as 0. Allow this case. + */ + if (!zone_sectors) + return true; + + /* + * Check partition start and size alignement. If the drive has a + * smaller last runt zone, ignore it and allow the partition to + * use it. Check the zone size too: it should be a power of 2 number + * of sectors. + */ + if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) { + u32 rem; + + div_u64_rem(from, zone_sectors, &rem); + if (rem) + return false; + if ((from + size) < get_capacity(disk)) { + div_u64_rem(size, zone_sectors, &rem); + if (rem) + return false; + } + + } else { + + if (from & (zone_sectors - 1)) + return false; + if ((from + size) < get_capacity(disk) && + (size & (zone_sectors - 1))) + return false; + + } + + return true; +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state = NULL; @@ -529,6 +579,21 @@ rescan: } } + /* + * On a zoned block device, partitions should be aligned on the + * device zone size (i.e. zone boundary crossing not allowed). + * Otherwise, resetting the write pointer of the last zone of + * one partition may impact the following partition. + */ + if (bdev_is_zoned(bdev) && + !part_zone_aligned(disk, bdev, from, size)) { + printk(KERN_WARNING + "%s: p%d start %llu+%llu is not zone aligned\n", + disk->disk_name, p, (unsigned long long) from, + (unsigned long long) size); + continue; + } + part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bcd86e5cd546..39f70d968754 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 47a61474e795..14b081af8d61 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <asm/dasd.h> #include <asm/ebcdic.h> -#include <asm/uaccess.h> 
+#include <linux/uaccess.h> #include <asm/vtoc.h> #include "check.h" diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 0774799942e0..2a2fc768b27a 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -28,7 +28,7 @@ #include <linux/slab.h> #include <linux/times.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> @@ -182,6 +182,9 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(WRITE_16, filter->write_ok); __set_bit(WRITE_LONG, filter->write_ok); __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(WRITE_SAME, filter->write_ok); + __set_bit(WRITE_SAME_16, filter->write_ok); + __set_bit(WRITE_SAME_32, filter->write_ok); __set_bit(ERASE, filter->write_ok); __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); __set_bit(MODE_SELECT, filter->write_ok); @@ -227,15 +230,17 @@ EXPORT_SYMBOL(blk_verify_command); static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, struct sg_io_hdr *hdr, fmode_t mode) { - if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) + struct scsi_request *req = scsi_req(rq); + + if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) return -EPERM; /* * fill in request structure */ - rq->cmd_len = hdr->cmd_len; + req->cmd_len = hdr->cmd_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -251,6 +256,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, struct bio *bio) { + struct scsi_request *req = scsi_req(rq); int r, ret = 0; /* @@ -264,13 +270,13 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; - hdr->resid = rq->resid_len; + hdr->resid = req->resid_len; hdr->sb_len_wr = 0; - if (rq->sense_len && hdr->sbp) { - int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); + if (req->sense_len && hdr->sbp) { + int len = min((unsigned int) hdr->mx_sb_len, req->sense_len); - if (!copy_to_user(hdr->sbp, rq->sense, len)) + if (!copy_to_user(hdr->sbp, req->sense, len)) hdr->sb_len_wr = len; else ret = -EFAULT; @@ -291,7 +297,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, int writing = 0; int at_head = 0; struct request *rq; - char sense[SCSI_SENSE_BUFFERSIZE]; + struct scsi_request *req; struct bio *bio; if (hdr->interface_id != 'S') @@ -315,14 +321,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, at_head = 1; ret = -ENOMEM; - rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); + rq = blk_get_request(q, writing ? 
REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + GFP_KERNEL); if (IS_ERR(rq)) return PTR_ERR(rq); - blk_rq_set_block_pc(rq); + req = scsi_req(rq); + scsi_req_init(rq); if (hdr->cmd_len > BLK_MAX_CDB) { - rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); - if (!rq->cmd) + req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!req->cmd) goto out_put_request; } @@ -354,9 +362,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; bio = rq->bio; - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; rq->retries = 0; start_time = jiffies; @@ -372,8 +377,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, ret = blk_complete_sghdr_rq(rq, hdr, bio); out_free_cdb: - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(req); out_put_request: blk_put_request(rq); return ret; @@ -417,9 +421,10 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, struct scsi_ioctl_command __user *sic) { struct request *rq; + struct scsi_request *req; int err; unsigned int in_len, out_len, bytes, opcode, cmdlen; - char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; + char *buffer = NULL; if (!sic) return -EINVAL; @@ -444,12 +449,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); + rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; } - blk_rq_set_block_pc(rq); + req = scsi_req(rq); + scsi_req_init(rq); cmdlen = COMMAND_SIZE(opcode); @@ -457,14 +464,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, * get command and data to send to device, if any */ err = -EFAULT; - rq->cmd_len = cmdlen; - if (copy_from_user(rq->cmd, sic->data, cmdlen)) + req->cmd_len = cmdlen; + if (copy_from_user(req->cmd, sic->data, cmdlen)) goto error; if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode & FMODE_WRITE); if (err) goto error; @@ -500,18 +507,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; } - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - blk_execute_rq(q, disk, rq, 0); err = rq->errors & 0xff; /* only 8 bit SCSI status */ if (err) { - if (rq->sense_len && rq->sense) { - bytes = (OMAX_SB_LEN > rq->sense_len) ? - rq->sense_len : OMAX_SB_LEN; - if (copy_to_user(sic->data, rq->sense, bytes)) + if (req->sense_len && req->sense) { + bytes = (OMAX_SB_LEN > req->sense_len) ? 
+ req->sense_len : OMAX_SB_LEN; + if (copy_to_user(sic->data, req->sense, bytes)) err = -EFAULT; } } else { @@ -536,14 +539,14 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_RECLAIM); + rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - blk_rq_set_block_pc(rq); + scsi_req_init(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - rq->cmd[0] = cmd; - rq->cmd[4] = data; - rq->cmd_len = 6; + scsi_req(rq)->cmd[0] = cmd; + scsi_req(rq)->cmd[4] = data; + scsi_req(rq)->cmd_len = 6; err = blk_execute_rq(q, bd_disk, rq, 0); blk_put_request(rq); @@ -740,6 +743,17 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, } EXPORT_SYMBOL(scsi_cmd_blk_ioctl); +void scsi_req_init(struct request *rq) +{ + struct scsi_request *req = scsi_req(rq); + + memset(req->__cmd, 0, sizeof(req->__cmd)); + req->cmd = req->__cmd; + req->cmd_len = BLK_MAX_CDB; + req->sense_len = 0; +} +EXPORT_SYMBOL(scsi_req_init); + static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); diff --git a/block/sed-opal.c b/block/sed-opal.c new file mode 100644 index 000000000000..d1c52ba4d62d --- /dev/null +++ b/block/sed-opal.c @@ -0,0 +1,2488 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Scott Bauer <scott.bauer@intel.com> + * Rafael Antognolli <rafael.antognolli@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/genhd.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <uapi/linux/sed-opal.h> +#include <linux/sed-opal.h> +#include <linux/string.h> +#include <linux/kdev_t.h> + +#include "opal_proto.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. 
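+ * For example, a bare method-status list such as F0 00 00 00 F1
+ * (STARTLIST, three tiny unsigned atoms, ENDLIST) parses into five
+ * opal_resp_tok entries whose pos pointers reference the response
+ * buffer directly; token bytes are never copied out of it.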
+ */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +struct opal_dev { + bool supported; + + void *data; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + + +static const u8 opaluid[][OPAL_UID_LENGTH] = { + /* users */ + [OPAL_SMUID_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [OPAL_THISSP_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ADMINSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [OPAL_ENTERPRISE_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_ANYBODY_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_SID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ADMIN1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_USER1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [OPAL_USER2_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + [OPAL_PSID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + [OPAL_ENTERPRISE_BANDMASTER0_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 }, + [OPAL_ENTERPRISE_ERASEMASTER_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, + + /* tables */ + + [OPAL_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [OPAL_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [OPAL_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [OPAL_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + + /* C_PIN_TABLE object ID's */ + + [OPAL_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [OPAL_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [OPAL_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + + [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [OPAL_HALF_UID_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, + + /* special value for omitted optional parameter */ + [OPAL_UID_HEXFF] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; + +/* + * TCG Storage SSC Methods. 
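+ * Each entry below is the 8-byte UID of the method being invoked; it is
+ * sent immediately after the invoking object's UID in an OPAL_CALL.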
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +static const u8 opalmethod[][OPAL_UID_LENGTH] = { + [OPAL_PROPERTIES] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [OPAL_STARTSESSION] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [OPAL_REVERT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [OPAL_ACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [OPAL_EGET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ESET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 }, + [OPAL_NEXT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [OPAL_EAUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c }, + [OPAL_GETACL] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [OPAL_GENKEY] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [OPAL_REVERTSP] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [OPAL_GET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [OPAL_SET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [OPAL_AUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [OPAL_RANDOM] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [OPAL_ERASE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +typedef int (cont_fn)(struct opal_dev *dev); + +static int end_opal_session_error(struct opal_dev *dev); + +struct opal_suspend_data { + struct opal_lock_unlock unlk; + u8 lr; + struct list_head node; +}; + +/* + * Derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 5.1.5 Method Status Codes + */ +static const char * const opal_errors[] = { + "Success", + "Not Authorized", + "Unknown Error", + "SP Busy", + "SP Failed", + "SP Disabled", + "SP Frozen", + "No Sessions Available", + "Uniqueness Conflict", + "Insufficient Space", + "Insufficient Rows", + "Invalid Function", + "Invalid Parameter", + "Invalid Reference", + "Unknown Error", + "TPER Malfunction", + "Transaction Failure", + "Response Overflow", + "Authority Locked Out", +}; + +static const char *opal_error_to_human(int error) +{ + if (error == 0x3f) + return "Failed"; + + if (error >= ARRAY_SIZE(opal_errors) || error < 0) + return "Unknown Error"; + + return opal_errors[error]; +} + +static void print_buffer(const u8 *ptr, u32 length) +{ +#ifdef DEBUG + print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length); + pr_debug("\n"); +#endif +} + +static bool check_tper(const void *data) +{ + const struct d0_tper_features *tper = data; + u8 flags = tper->supported_features; + + if (!(flags & TPER_SYNC_SUPPORTED)) { + pr_err("TPer sync not supported. 
flags = %d\n", + tper->supported_features); + return false; + } + + return true; +} + +static bool check_sum(const void *data) +{ + const struct d0_single_user_mode *sum = data; + u32 nlo = be32_to_cpu(sum->num_locking_objects); + + if (nlo == 0) { + pr_err("Need at least one locking object.\n"); + return false; + } + + pr_debug("Number of locking objects: %d\n", nlo); + + return true; +} + +static u16 get_comid_v100(const void *data) +{ + const struct d0_opal_v100 *v100 = data; + + return be16_to_cpu(v100->baseComID); +} + +static u16 get_comid_v200(const void *data) +{ + const struct d0_opal_v200 *v200 = data; + + return be16_to_cpu(v200->baseComID); +} + +static int opal_send_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->cmd, IO_BUFFER_LENGTH, + true); +} + +static int opal_recv_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->resp, IO_BUFFER_LENGTH, + false); +} + +static int opal_recv_check(struct opal_dev *dev) +{ + size_t buflen = IO_BUFFER_LENGTH; + void *buffer = dev->resp; + struct opal_header *hdr = buffer; + int ret; + + do { + pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n", + hdr->cp.outstandingData, + hdr->cp.minTransfer); + + if (hdr->cp.outstandingData == 0 || + hdr->cp.minTransfer != 0) + return 0; + + memset(buffer, 0, buflen); + ret = opal_recv_cmd(dev); + } while (!ret); + + return ret; +} + +static int opal_send_recv(struct opal_dev *dev, cont_fn *cont) +{ + int ret; + + ret = opal_send_cmd(dev); + if (ret) + return ret; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + ret = opal_recv_check(dev); + if (ret) + return ret; + return cont(dev); +} + +static void check_geometry(struct opal_dev *dev, const void *data) +{ + const struct d0_geometry_features *geo = data; + + dev->align = geo->alignment_granularity; + dev->lowest_lba = geo->lowest_aligned_lba; +} + +static int next(struct opal_dev *dev) +{ + opal_step func; + int error = 0; + + do { + func = dev->funcs[dev->state]; + if (!func) + break; + + error = func(dev); + if (error) { + pr_err("Error on step function: %d with error %d: %s\n", + dev->state, error, + opal_error_to_human(error)); + + /* For each OPAL command we do a discovery0 then we + * start some sort of session. + * If we haven't passed state 1 then there was an error + * on discovery0 or during the attempt to start a + * session. Therefore we shouldn't attempt to terminate + * a session, as one has not yet been created. 
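+ * (A step at index N only runs once steps 0..N-1 have succeeded, so a
+ * failure with dev->state > 1 means discovery0 and the session start
+ * both completed and the open session must be torn down.)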
+ */
+ if (dev->state > 1)
+ return end_opal_session_error(dev);
+ }
+ dev->state++;
+ } while (!error);
+
+ return error;
+}
+
+static int opal_discovery0_end(struct opal_dev *dev)
+{
+ bool found_com_id = false, supported = true, single_user = false;
+ const struct d0_header *hdr = (struct d0_header *)dev->resp;
+ const u8 *epos = dev->resp, *cpos = dev->resp;
+ u16 comid = 0;
+
+ print_buffer(dev->resp, be32_to_cpu(hdr->length));
+
+ epos += be32_to_cpu(hdr->length); /* end of buffer */
+ cpos += sizeof(*hdr); /* current position on buffer */
+
+ while (cpos < epos && supported) {
+ const struct d0_features *body =
+ (const struct d0_features *)cpos;
+
+ switch (be16_to_cpu(body->code)) {
+ case FC_TPER:
+ supported = check_tper(body->features);
+ break;
+ case FC_SINGLEUSER:
+ single_user = check_sum(body->features);
+ break;
+ case FC_GEOMETRY:
+ check_geometry(dev, body);
+ break;
+ case FC_LOCKING:
+ case FC_ENTERPRISE:
+ case FC_DATASTORE:
+ /* some ignored properties */
+ pr_debug("Found OPAL feature description: %d\n",
+ be16_to_cpu(body->code));
+ break;
+ case FC_OPALV100:
+ comid = get_comid_v100(body->features);
+ found_com_id = true;
+ break;
+ case FC_OPALV200:
+ comid = get_comid_v200(body->features);
+ found_com_id = true;
+ break;
+ case 0xbfff ... 0xffff:
+ /* vendor specific, just ignore */
+ break;
+ default:
+ pr_debug("OPAL Unknown feature: %d\n",
+ be16_to_cpu(body->code));
+
+ }
+ cpos += body->length + 4;
+ }
+
+ if (!supported) {
+ pr_debug("This device is not Opal enabled. Not Supported!\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!single_user)
+ pr_debug("Device doesn't support single user mode\n");
+
+
+ if (!found_com_id) {
+ pr_debug("Could not find OPAL comid for device. Returning early\n");
+ return -EOPNOTSUPP;
+ }
+
+ dev->comid = comid;
+
+ return 0;
+}
+
+static int opal_discovery0(struct opal_dev *dev)
+{
+ int ret;
+
+ memset(dev->resp, 0, IO_BUFFER_LENGTH);
+ dev->comid = OPAL_DISCOVERY_COMID;
+ ret = opal_recv_cmd(dev);
+ if (ret)
+ return ret;
+ return opal_discovery0_end(dev);
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+ if (*err)
+ return;
+ if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
+ pr_err("Error adding u8: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+ cmd->cmd[cmd->pos++] = tok;
+}
+
+static void add_short_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 atom;
+ int err = 0;
+
+ atom = SHORT_ATOM_ID;
+ atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0;
+ atom |= has_sign ? SHORT_ATOM_SIGNED : 0;
+ atom |= len & SHORT_ATOM_LEN_MASK;
+
+ add_token_u8(&err, cmd, atom);
+}
+
+static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 header0;
+
+ header0 = MEDIUM_ATOM_ID;
+ header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
+ header0 |= has_sign ?
MEDIUM_ATOM_SIGNED : 0; + header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; + cmd->cmd[cmd->pos++] = len; +} + +static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) +{ + + size_t len; + int msb; + u8 n; + + if (!(number & ~TINY_ATOM_DATA_MASK)) { + add_token_u8(err, cmd, number); + return; + } + + msb = fls(number); + len = DIV_ROUND_UP(msb, 4); + + if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { + pr_err("Error adding u64: end of buffer.\n"); + *err = -ERANGE; + return; + } + add_short_atom_header(cmd, false, false, len); + while (len--) { + n = number >> (len * 8); + add_token_u8(err, cmd, n); + } +} + +static void add_token_bytestring(int *err, struct opal_dev *cmd, + const u8 *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) + return; + + if (len & ~SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { + pr_err("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) + add_short_atom_header(cmd, true, false, len); + else + add_medium_atom_header(cmd, true, false, len); + + memcpy(&cmd->cmd[cmd->pos], bytestring, len); + cmd->pos += len; + +} + +static int build_locking_range(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range. Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + if (lr == 0) + return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = lr; + + return 0; +} + +static int build_locking_user(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range user, Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + buffer[7] = lr + 1; + + return 0; +} + +static void set_comid(struct opal_dev *cmd, u16 comid) +{ + struct opal_header *hdr = (struct opal_header *)cmd->cmd; + + hdr->cp.extendedComID[0] = comid >> 8; + hdr->cp.extendedComID[1] = comid; + hdr->cp.extendedComID[2] = 0; + hdr->cp.extendedComID[3] = 0; +} + +static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) +{ + struct opal_header *hdr; + int err = 0; + + add_token_u8(&err, cmd, OPAL_ENDOFDATA); + add_token_u8(&err, cmd, OPAL_STARTLIST); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, OPAL_ENDLIST); + + if (err) { + pr_err("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct opal_header *) cmd->cmd; + + hdr->pkt.tsn = cpu_to_be32(tsn); + hdr->pkt.hsn = cpu_to_be32(hsn); + + hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); + while (cmd->pos % 4) { + if (cmd->pos >= IO_BUFFER_LENGTH) { + pr_err("Error: Buffer overrun\n"); + return -ERANGE; + } + cmd->cmd[cmd->pos++] = 0; + } + hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) - + sizeof(hdr->pkt)); + hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp)); + + return 0; +} + +static enum opal_response_token token_type(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return OPAL_DTA_TOKENID_INVALID; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return OPAL_DTA_TOKENID_INVALID; + } + + return tok->type; +} + +/* + * This function returns 0 in case of invalid 
token. One should call + * token_type() first to find out if the token is valid or not. + */ +static enum opal_token response_get_token(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return 0; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return 0; + } + + return tok->pos[0]; +} + +static size_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->width = OPAL_WIDTH_TINY; + + if (pos[0] & TINY_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + tok->type = OPAL_DTA_TOKENID_UINT; + tok->stored.u = pos[0] & 0x3f; + } + + return tok->len; +} + +static size_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; + tok->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SHORT_ATOM_BYTESTRING) { + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SHORT_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + u64 u_integer = 0; + int i, b = 0; + + tok->type = OPAL_DTA_TOKENID_UINT; + if (tok->len > 9) { + pr_warn("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = tok->len - 1; i > 0; i--) { + u_integer |= ((u64)pos[i] << (8 * b)); + b++; + } + tok->stored.u = u_integer; + } + + return tok->len; +} + +static size_t response_parse_medium(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; + tok->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & MEDIUM_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & MEDIUM_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; + tok->width = OPAL_WIDTH_LONG; + + if (pos[0] & LONG_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & LONG_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->type = OPAL_DTA_TOKENID_TOKEN; + tok->width = OPAL_WIDTH_TOKEN; + + return tok->len; +} + +static int response_parse(const u8 *buf, size_t length, + struct parsed_resp *resp) +{ + const struct opal_header *hdr; + struct opal_resp_tok *iter; + int num_entries = 0; + int total; + size_t token_length; + const u8 *pos; + + if (!buf) + return -EFAULT; + + if (!resp) + return -EFAULT; + + hdr = (struct opal_header *)buf; + pos = buf; + pos += sizeof(*hdr); + + pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + + if (hdr->cp.length == 0 || hdr->pkt.length == 0 || + hdr->subpkt.length == 0) { + pr_err("Bad header length. 
cp: %d, pkt: %d, subpkt: %d\n",
+ be32_to_cpu(hdr->cp.length),
+ be32_to_cpu(hdr->pkt.length),
+ be32_to_cpu(hdr->subpkt.length));
+ print_buffer(pos, sizeof(*hdr));
+ return -EINVAL;
+ }
+
+ if (pos > buf + length)
+ return -EFAULT;
+
+ iter = resp->toks;
+ total = be32_to_cpu(hdr->subpkt.length);
+ print_buffer(pos, total);
+ while (total > 0) {
+ if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
+ token_length = response_parse_tiny(iter, pos);
+ else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */
+ token_length = response_parse_short(iter, pos);
+ else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */
+ token_length = response_parse_medium(iter, pos);
+ else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
+ token_length = response_parse_long(iter, pos);
+ else /* TOKEN */
+ token_length = response_parse_token(iter, pos);
+
+ if (token_length == -EINVAL)
+ return -EINVAL;
+
+ pos += token_length;
+ total -= token_length;
+ iter++;
+ num_entries++;
+ }
+
+ if (num_entries == 0) {
+ pr_err("Couldn't parse response.\n");
+ return -EINVAL;
+ }
+ resp->num = num_entries;
+
+ return 0;
+}
+
+static size_t response_get_string(const struct parsed_resp *resp, int n,
+ const char **store)
+{
+ *store = NULL;
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+ pr_err("Token is not a byte string!\n");
+ return 0;
+ }
+
+ *store = resp->toks[n].pos + 1;
+ return resp->toks[n].len - 1;
+}
+
+static u64 response_get_u64(const struct parsed_resp *resp, int n)
+{
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
+ pr_err("Token is not unsigned int: %d\n",
+ resp->toks[n].type);
+ return 0;
+ }
+
+ if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
+ resp->toks[n].width == OPAL_WIDTH_SHORT)) {
+ pr_err("Atom is not short or tiny: %d\n",
+ resp->toks[n].width);
+ return 0;
+ }
+
+ return resp->toks[n].stored.u;
+}
+
+static u8 response_status(const struct parsed_resp *resp)
+{
+ if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
+ response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+ return 0;
+ }
+
+ if (resp->num < 5)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
+ token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
+ response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
+ response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ return response_get_u64(resp, resp->num - 4);
+}
+
+/* Parses and checks for errors */
+static int parse_and_check_status(struct opal_dev *dev)
+{
+ int error;
+
+ print_buffer(dev->cmd, dev->pos);
+
+ error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
+ if (error) {
+ pr_err("Couldn't parse response.\n");
+ return error;
+ }
+
+ return response_status(&dev->parsed);
+}
+
+static void clear_opal_cmd(struct opal_dev *dev)
+{
+ dev->pos = sizeof(struct opal_header);
+ memset(dev->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static int start_opal_session_cont(struct opal_dev *dev)
+{
+ u32 hsn, tsn;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+
+ hsn = response_get_u64(&dev->parsed, 4);
+ tsn = response_get_u64(&dev->parsed, 5);
+
+ if (hsn == 0 && tsn == 0) {
+
pr_err("Couldn't authenticate session\n"); + return -EPERM; + } + + dev->hsn = hsn; + dev->tsn = tsn; + return 0; +} + +static void add_suspend_info(struct opal_dev *dev, + struct opal_suspend_data *sus) +{ + struct opal_suspend_data *iter; + + list_for_each_entry(iter, &dev->unlk_lst, node) { + if (iter->lr == sus->lr) { + list_del(&iter->node); + kfree(iter); + break; + } + } + list_add_tail(&sus->node, &dev->unlk_lst); +} + +static int end_session_cont(struct opal_dev *dev) +{ + dev->hsn = 0; + dev->tsn = 0; + return parse_and_check_status(dev); +} + +static int finalize_and_send(struct opal_dev *dev, cont_fn cont) +{ + int ret; + + ret = cmd_finalize(dev, dev->hsn, dev->tsn); + if (ret) { + pr_err("Error finalizing command buffer: %d\n", ret); + return ret; + } + + print_buffer(dev->cmd, dev->pos); + + return opal_send_recv(dev, cont); +} + +static int gen_key(struct opal_dev *dev) +{ + const u8 *method; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); + method = opalmethod[OPAL_GENKEY]; + kfree(dev->prev_data); + dev->prev_data = NULL; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building gen key command\n"); + return err; + + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_active_key_cont(struct opal_dev *dev) +{ + const char *activekey; + size_t keylen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); + if (!activekey) { + pr_err("%s: Couldn't extract the Activekey from the response\n", + __func__); + return OPAL_INVAL_PARAM; + } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); + + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = keylen; + + return 0; +} + +static int get_active_key(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + u8 *lr; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + lr = dev->func_data[dev->state]; + + err = build_locking_range(uid, sizeof(uid), *lr); + if (err) + return err; + + err = 0; + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* startCloumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* endColumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building get active key command\n"); + return err; + } + + return finalize_and_send(dev, get_active_key_cont); +} + +static int generic_lr_enable_disable(struct opal_dev *dev, + u8 *uid, bool rle, bool wle, + bool rl, bool wl) +{ + int err = 0; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + + add_token_u8(&err, 
dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* ReadLockEnabled */ + add_token_u8(&err, dev, rle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /* WriteLockEnabled */ + add_token_u8(&err, dev, wle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, rl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, wl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + return err; +} + +static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, + struct opal_user_lr_setup *setup) +{ + int err; + + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, + 0, 0); + if (err) + pr_err("Failed to create enable global lr command\n"); + return err; +} + +static int setup_locking_range(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup; + u8 lr; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + setup = dev->func_data[dev->state]; + lr = setup->session.opal_key.lr; + err = build_locking_range(uid, sizeof(uid), lr); + if (err) + return err; + + if (lr == 0) + err = enable_global_lr(dev, uid, setup); + else { + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Ranges Start */ + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* Ranges length */ + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /*ReadLockEnabled */ + add_token_u64(&err, dev, !!setup->RLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /*WriteLockEnabled*/ + add_token_u64(&err, dev, !!setup->WLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } + if (err) { + pr_err("Error building Setup Locking range command.\n"); + return err; + + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int start_generic_opal_session(struct opal_dev *dev, + enum opal_uid auth, + enum opal_uid sp_type, + const char *key, + u8 key_len) +{ + u32 hsn; + int err = 0; + + if (key == NULL && auth != OPAL_ANYBODY_UID) { + pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \ + "Challenge, and not as the Anybody UID\n", __func__); + return OPAL_INVAL_PARAM; + } + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, 
dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + + switch (auth) { + case OPAL_ANYBODY_UID: + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + case OPAL_ADMIN1_UID: + case OPAL_SID_UID: + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); /* HostChallenge */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* HostSignAuth */ + add_token_bytestring(&err, dev, opaluid[auth], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + default: + pr_err("Cannot start Admin SP session with auth %d\n", auth); + return OPAL_INVAL_PARAM; + } + + if (err) { + pr_err("Error building start adminsp session command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int start_anybodyASP_opal_session(struct opal_dev *dev) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_ADMINSP_UID, NULL, 0); +} + +static int start_SIDASP_opal_session(struct opal_dev *dev) +{ + int ret; + const u8 *key = dev->prev_data; + struct opal_key *okey; + + if (!key) { + okey = dev->func_data[dev->state]; + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); + } else { + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + key, dev->prev_d_len); + kfree(key); + dev->prev_data = NULL; + } + return ret; +} + +static inline int start_admin1LSP_opal_session(struct opal_dev *dev) +{ + struct opal_key *key = dev->func_data[dev->state]; + + return start_generic_opal_session(dev, OPAL_ADMIN1_UID, + OPAL_LOCKINGSP_UID, + key->key, key->key_len); +} + +static int start_auth_opal_session(struct opal_dev *dev) +{ + u8 lk_ul_user[OPAL_UID_LENGTH]; + int err = 0; + + struct opal_session_info *session = dev->func_data[dev->state]; + size_t keylen = session->opal_key.key_len; + u8 *key = session->opal_key.key; + u32 hsn = GENERIC_HOST_SESSION_NUM; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + if (session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->opal_key.lr); + if (err) + return err; + + } else if (session->who != OPAL_ADMIN1 && !session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->who - 1); + if (err) + return err; + } else + memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); + add_token_bytestring(&err, dev, key, keylen); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error 
building STARTSESSION command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int revert_tper(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building REVERT TPER command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int internal_activate_user(struct opal_dev *dev) +{ + struct opal_session_info *session = dev->func_data[dev->state]; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + uid[7] = session->who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* Enabled */ + add_token_u8(&err, dev, OPAL_TRUE); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Activate UserN command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int erase_locking_range(struct opal_dev *dev) +{ + struct opal_session_info *session; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + session = dev->func_data[dev->state]; + + if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) + return -ERANGE; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Erase Locking Range Command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_done(struct opal_dev *dev) +{ + u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 2); /* Done */ + add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR Done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_enable_disable(struct opal_dev *dev) +{ + u8 
mbr_en_dis = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, mbr_en_dis); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, + struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* PIN */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + return err; +} + +static int set_new_pw(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_session_info *usr = dev->func_data[dev->state]; + + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH); + + if (usr->who != OPAL_ADMIN1) { + cpin_uid[5] = 0x03; + if (usr->sum) + cpin_uid[7] = usr->opal_key.lr + 1; + else + cpin_uid[7] = usr->who; + } + + if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, + cpin_uid, dev)) { + pr_err("Error building set password command.\n"); + return -ERANGE; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_sid_cpin_pin(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_key *key = dev->func_data[dev->state]; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); + + if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { + pr_err("Error building Set SID cpin\n"); + return -ERANGE; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int add_user_to_lr(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 user_uid[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + lkul = dev->func_data[dev->state]; + + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], + OPAL_UID_LENGTH); + + if (lkul->l_state == OPAL_RW) + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], + OPAL_UID_LENGTH); + + lr_buffer[7] = lkul->session.opal_key.lr; + + memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + user_uid[7] = lkul->session.who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + 
add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building add user to locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int lock_unlock_locking_range(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + const u8 *method; + struct opal_lock_unlock *lkul; + u8 read_locked = 1, write_locked = 1; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state... 
returning to uland\n"); + return OPAL_INVAL_PARAM; + } + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, read_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, write_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building SET command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + + +static int lock_unlock_locking_range_sum(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 read_locked = 1, write_locked = 1; + const u8 *method; + struct opal_lock_unlock *lkul; + int ret; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state.\n"); + return OPAL_INVAL_PARAM; + } + ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, + read_locked, write_locked); + + if (ret < 0) { + pr_err("Error building SET command.\n"); + return ret; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int activate_lsp(struct opal_dev *dev) +{ + struct opal_lr_act *opal_act; + u8 user_lr[OPAL_UID_LENGTH]; + u8 uint_3 = 0x83; + int err = 0, i; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + opal_act = dev->func_data[dev->state]; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE], + OPAL_UID_LENGTH); + + + if (opal_act->sum) { + err = build_locking_range(user_lr, sizeof(user_lr), + opal_act->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, uint_3); + add_token_u8(&err, dev, 6); + add_token_u8(&err, dev, 0); + add_token_u8(&err, dev, 0); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_act->num_lrs; i++) { + user_lr[7] = opal_act->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } else { + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + } + + if (err) { + pr_err("Error building Activate LockingSP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_lsp_lifecycle_cont(struct opal_dev *dev) +{ + u8 lc_status; + int error = 0; + + error = parse_and_check_status(dev); + if 
(error) + return error; + + lc_status = response_get_u64(&dev->parsed, 4); + /* 0x08 is Manufacured Inactive */ + /* 0x09 is Manufactured */ + if (lc_status != OPAL_MANUFACTURED_INACTIVE) { + pr_err("Couldn't determine the status of the Lifcycle state\n"); + return -ENODEV; + } + + return 0; +} + +/* Determine if we're in the Manufactured Inactive or Active state */ +static int get_lsp_lifecycle(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building GET Lifecycle Status command\n"); + return err; + } + + return finalize_and_send(dev, get_lsp_lifecycle_cont); +} + +static int get_msid_cpin_pin_cont(struct opal_dev *dev) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + strlen = response_get_string(&dev->parsed, 4, &msid_pin); + if (!msid_pin) { + pr_err("%s: Couldn't extract PIN from response\n", __func__); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = strlen; + + return 0; +} + +static int get_msid_cpin_pin(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 3); /* PIN */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 3); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + return finalize_and_send(dev, get_msid_cpin_pin_cont); +} + +static int build_end_opal_session(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + add_token_u8(&err, dev, OPAL_ENDOFSESSION); + return err; +} + +static int end_opal_session(struct opal_dev *dev) +{ + int ret = build_end_opal_session(dev); + + if (ret < 0) + return ret; + return finalize_and_send(dev, end_session_cont); +} + +static int end_opal_session_error(struct opal_dev *dev) +{ + const opal_step error_end_session[] = { + end_opal_session, + NULL, + }; + dev->funcs = error_end_session; + dev->state = 0; + return next(dev); +} + +static inline void setup_opal_dev(struct 
opal_dev *dev, + const opal_step *funcs) +{ + dev->state = 0; + dev->funcs = funcs; + dev->tsn = 0; + dev->hsn = 0; + dev->func_data = NULL; + dev->prev_data = NULL; +} + +static int check_opal_support(struct opal_dev *dev) +{ + static const opal_step funcs[] = { + opal_discovery0, + NULL + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + ret = next(dev); + dev->supported = !ret; + mutex_unlock(&dev->dev_lock); + return ret; +} + +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) +{ + struct opal_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + INIT_LIST_HEAD(&dev->unlk_lst); + mutex_init(&dev->dev_lock); + dev->data = data; + dev->send_recv = send_recv; + if (check_opal_support(dev) != 0) { + pr_debug("Opal is not supported on this device\n"); + kfree(dev); + return NULL; + } + return dev; +} +EXPORT_SYMBOL(init_opal_dev); + +static int opal_secure_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + get_active_key, + gen_key, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = &opal_session->opal_key.lr; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + erase_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = opal_session; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, + struct opal_mbr_data *opal_mbr) +{ + void *func_data[6] = { NULL }; + static const opal_step mbr_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + set_mbr_done, + end_opal_session, + start_admin1LSP_opal_session, + set_mbr_enable_disable, + end_opal_session, + NULL, + }; + int ret; + + if (opal_mbr->enable_disable != OPAL_MBR_ENABLE && + opal_mbr->enable_disable != OPAL_MBR_DISABLE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, mbr_funcs); + dev->func_data = func_data; + dev->func_data[1] = &opal_mbr->key; + dev->func_data[2] = &opal_mbr->enable_disable; + dev->func_data[4] = &opal_mbr->key; + dev->func_data[5] = &opal_mbr->enable_disable; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *suspend; + + suspend = kzalloc(sizeof(*suspend), GFP_KERNEL); + if (!suspend) + return -ENOMEM; + + suspend->unlk = *lk_unlk; + suspend->lr = lk_unlk->session.opal_key.lr; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + add_suspend_info(dev, suspend); + mutex_unlock(&dev->dev_lock); + return 0; +} + +static int opal_add_user_to_lr(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + static const opal_step funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + add_user_to_lr, + end_opal_session, + NULL + }; + int ret; + + if (lk_unlk->l_state 
!= OPAL_RO && + lk_unlk->l_state != OPAL_RW) { + pr_err("Locking state was not RO or RW\n"); + return -EINVAL; + } + if (lk_unlk->session.who < OPAL_USER1 && + lk_unlk->session.who > OPAL_USER9) { + pr_err("Authority was not within the range of users: %d\n", + lk_unlk->session.who); + return -EINVAL; + } + if (lk_unlk->session.sum) { + pr_err("%s not supported in sum. Use setup locking range\n", + __func__); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session.opal_key; + dev->func_data[2] = lk_unlk; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +{ + void *data[2] = { NULL }; + static const opal_step revert_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, + revert_tper, /* controller will terminate session */ + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, revert_funcs); + dev->func_data = data; + dev->func_data[1] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int __opal_lock_unlock_sum(struct opal_dev *dev) +{ + static const opal_step ulk_funcs_sum[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range_sum, + end_opal_session, + NULL + }; + + dev->funcs = ulk_funcs_sum; + return next(dev); +} + +static int __opal_lock_unlock(struct opal_dev *dev) +{ + static const opal_step _unlock_funcs[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range, + end_opal_session, + NULL + }; + + dev->funcs = _unlock_funcs; + return next(dev); +} + +static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + int ret; + + if (lk_unlk->session.who < OPAL_ADMIN1 || + lk_unlk->session.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session; + dev->func_data[2] = lk_unlk; + + if (lk_unlk->session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) +{ + static const opal_step owner_funcs[] = { + opal_discovery0, + start_anybodyASP_opal_session, + get_msid_cpin_pin, + end_opal_session, + start_SIDASP_opal_session, + set_sid_cpin_pin, + end_opal_session, + NULL + }; + void *data[6] = { NULL }; + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, owner_funcs); + dev->func_data = data; + dev->func_data[4] = opal; + dev->func_data[5] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act) +{ + void *data[4] = { NULL }; + static const opal_step active_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, /* Open session as SID auth */ + get_lsp_lifecycle, + activate_lsp, + end_opal_session, + NULL + }; + int ret; + + if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, active_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lr_act->key; + dev->func_data[3] = opal_lr_act; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_setup_locking_range(struct opal_dev *dev, + struct 
opal_user_lr_setup *opal_lrs) +{ + void *data[3] = { NULL }; + static const opal_step lr_funcs[] = { + opal_discovery0, + start_auth_opal_session, + setup_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, lr_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lrs->session; + dev->func_data[2] = opal_lrs; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + static const opal_step pw_funcs[] = { + opal_discovery0, + start_auth_opal_session, + set_new_pw, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + if (opal_pw->session.who < OPAL_ADMIN1 || + opal_pw->session.who > OPAL_USER9 || + opal_pw->new_user_pw.who < OPAL_ADMIN1 || + opal_pw->new_user_pw.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, pw_funcs); + dev->func_data = data; + dev->func_data[1] = (void *) &opal_pw->session; + dev->func_data[2] = (void *) &opal_pw->new_user_pw; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_user(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + static const opal_step act_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + internal_activate_user, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + /* We can't activate Admin1 it's active as manufactured */ + if (opal_session->who < OPAL_USER1 && + opal_session->who > OPAL_USER9) { + pr_err("Who was not a valid user: %d\n", opal_session->who); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, act_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_session->opal_key; + dev->func_data[2] = opal_session; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + struct opal_suspend_data *suspend; + void *func_data[3] = { NULL }; + bool was_failure = false; + int ret = 0; + + if (!dev) + return false; + if (!dev->supported) + return false; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + + list_for_each_entry(suspend, &dev->unlk_lst, node) { + dev->state = 0; + dev->func_data[1] = &suspend->unlk.session; + dev->func_data[2] = &suspend->unlk; + dev->tsn = 0; + dev->hsn = 0; + + if (suspend->unlk.session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + if (ret) { + pr_warn("Failed to unlock LR %hhu with sum %d\n", + suspend->unlk.session.opal_key.lr, + suspend->unlk.session.sum); + was_failure = true; + } + } + mutex_unlock(&dev->dev_lock); + return was_failure; +} +EXPORT_SYMBOL(opal_unlock_from_suspend); + +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) +{ + void *p; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!dev) + return -ENOTSUPP; + if (!dev->supported) { + pr_err("Not supported\n"); + return -ENOTSUPP; + } + + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (cmd) { + case IOC_OPAL_SAVE: + ret = opal_save(dev, p); + break; + case IOC_OPAL_LOCK_UNLOCK: + ret = opal_lock_unlock(dev, p); + break; + case IOC_OPAL_TAKE_OWNERSHIP: + ret = opal_take_ownership(dev, p); + break; + case IOC_OPAL_ACTIVATE_LSP: + ret = opal_activate_lsp(dev, p); + break; + case IOC_OPAL_SET_PW: + ret = opal_set_new_pw(dev, p); + break; + case IOC_OPAL_ACTIVATE_USR: + 
ret = opal_activate_user(dev, p); + break; + case IOC_OPAL_REVERT_TPR: + ret = opal_reverttper(dev, p); + break; + case IOC_OPAL_LR_SETUP: + ret = opal_setup_locking_range(dev, p); + break; + case IOC_OPAL_ADD_USR_TO_LR: + ret = opal_add_user_to_lr(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_MBR: + ret = opal_enable_disable_shadow_mbr(dev, p); + break; + case IOC_OPAL_ERASE_LR: + ret = opal_erase_locking_range(dev, p); + break; + case IOC_OPAL_SECURE_ERASE_LR: + ret = opal_secure_erase_locking_range(dev, p); + break; + default: + pr_warn("No such Opal Ioctl %u\n", cmd); + } + + kfree(p); + return ret; +} +EXPORT_SYMBOL_GPL(sed_ioctl);
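
The sed_ioctl() dispatcher above is the entry point a block driver forwards its unrecognized ioctls to after registering the device with init_opal_dev(). A minimal userspace sketch of how that interface might be exercised follows; it is not part of this patch. It assumes the IOC_OPAL_LOCK_UNLOCK command and the struct opal_lock_unlock / opal_session_info / opal_key layouts from the uapi header <linux/sed-opal.h> that accompanies this feature, and the device path and passphrase are purely illustrative.

/*
 * Illustrative only: unlock locking range 0 for read/write as Admin1
 * through the new SED Opal ioctl interface.  Requires CAP_SYS_ADMIN;
 * the kernel copies the argument with memdup_user() and returns
 * -ENOTSUPP if the device did not advertise Opal support.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sed-opal.h>

int main(void)
{
	struct opal_lock_unlock lkul;
	const char *passwd = "example-passphrase";	/* illustrative */
	int fd, ret;

	memset(&lkul, 0, sizeof(lkul));
	lkul.l_state = OPAL_RW;				/* unlock for read and write */
	lkul.session.sum = 0;				/* not single-user mode */
	lkul.session.who = OPAL_ADMIN1;
	lkul.session.opal_key.lr = 0;			/* global locking range */
	lkul.session.opal_key.key_len = strlen(passwd);
	memcpy(lkul.session.opal_key.key, passwd,
	       lkul.session.opal_key.key_len);

	fd = open("/dev/nvme0n1", O_RDWR);		/* device path is illustrative */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lkul);
	if (ret)
		perror("IOC_OPAL_LOCK_UNLOCK");

	close(fd);
	return ret ? 1 : 0;
}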