From ec645dc96699ea6c37b6de86c84d7288ea9a4ddf Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Sat, 17 Jul 2021 14:33:28 +0200
Subject: block: increase BLKCG_MAX_POLS

After mq-deadline learned to deal with cgroups, the BLKCG_MAX_POLS value
became too small for all the elevators to be registered properly. The
following issue is seen:

```
calling  bfq_init+0x0/0x8b @ 1
blkcg_policy_register: BLKCG_MAX_POLS too small
initcall bfq_init+0x0/0x8b returned -28 after 507 usecs
```

which renders BFQ non-functional.

Increase BLKCG_MAX_POLS to allow enough space for everyone.

Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support")
Link: https://lore.kernel.org/lkml/8988303.mDXGIdCtx8@natalenko.name/
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210717123328.945810-1-oleksandr@natalenko.name
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c454fb446fd0..2e12320cb121 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -57,7 +57,7 @@ struct blk_keyslot_manager;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		5
+#define BLKCG_MAX_POLS		6
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-- 
cgit v1.2.3


From 7764656b108cd308c39e9a8554353b8f9ca232a3 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Mon, 5 Jul 2021 21:38:29 +0800
Subject: nvme-pci: don't WARN_ON in nvme_reset_work if ctrl.state is not
 RESETTING

Followling process:
nvme_probe
  nvme_reset_ctrl
    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)
    queue_work(nvme_reset_wq, &ctrl->reset_work)

-------------->	nvme_remove
		  nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING)
worker_thread
  process_one_work
    nvme_reset_work
    WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)

, which will trigger WARN_ON in nvme_reset_work():
[  127.534298] WARNING: CPU: 0 PID: 139 at drivers/nvme/host/pci.c:2594
[  127.536161] CPU: 0 PID: 139 Comm: kworker/u8:7 Not tainted 5.13.0
[  127.552518] Call Trace:
[  127.552840]  ? kvm_sched_clock_read+0x25/0x40
[  127.553936]  ? native_send_call_func_single_ipi+0x1c/0x30
[  127.555117]  ? send_call_function_single_ipi+0x9b/0x130
[  127.556263]  ? __smp_call_single_queue+0x48/0x60
[  127.557278]  ? ttwu_queue_wakelist+0xfa/0x1c0
[  127.558231]  ? try_to_wake_up+0x265/0x9d0
[  127.559120]  ? ext4_end_io_rsv_work+0x160/0x290
[  127.560118]  process_one_work+0x28c/0x640
[  127.561002]  worker_thread+0x39a/0x700
[  127.561833]  ? rescuer_thread+0x580/0x580
[  127.562714]  kthread+0x18c/0x1e0
[  127.563444]  ? set_kthread_struct+0x70/0x70
[  127.564347]  ret_from_fork+0x1f/0x30

The preceding problem can be easily reproduced by executing following
script (based on blktests suite):
test() {
  pdev="$(_get_pci_dev_from_blkdev)"
  sysfs="/sys/bus/pci/devices/${pdev}"
  for ((i = 0; i < 10; i++)); do
    echo 1 > "$sysfs/remove"
    echo 1 > /sys/bus/pci/rescan
  done
}

Since the device ctrl could be updated as an non-RESETTING state by
repeating probe/remove in userspace (which is a normal situation), we
can replace stack dumping WARN_ON with a warnning message.

Fixes: 82b057caefaff ("nvme-pci: fix multiple ctrl removal schedulin")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
---
 drivers/nvme/host/pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 320051f5a3dd..51852085239e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2631,7 +2631,9 @@ static void nvme_reset_work(struct work_struct *work)
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result;
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
+	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
+			 dev->ctrl.state);
 		result = -ENODEV;
 		goto out;
 	}
-- 
cgit v1.2.3


From 5396fdac56d87d04e75e5068c0c92d33625f51e7 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 16 Jul 2021 13:30:35 +0200
Subject: nvme: fix refcounting imbalance when all paths are down

When the last path to a ns_head drops the current code
removes the ns_head from the subsystem list, but will only
delete the disk itself if the last reference to the ns_head
drops. This is causing an refcounting imbalance eg when
applications have a reference to the disk, as then they'll
never get notified that the disk is in fact dead.
This patch moves the call 'del_gendisk' into nvme_mpath_check_last_path(),
ensuring that the disk can be properly removed and applications get the
appropriate notifications.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c      | 14 +++++++++++---
 drivers/nvme/host/multipath.c |  9 ++++++++-
 drivers/nvme/host/nvme.h      | 11 ++---------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 11779be42186..17c05a4595f0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3807,6 +3807,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
+	bool last_path = false;
+
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
@@ -3815,8 +3817,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 
 	mutex_lock(&ns->ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
-	if (list_empty(&ns->head->list))
-		list_del_init(&ns->head->entry);
 	mutex_unlock(&ns->ctrl->subsys->lock);
 
 	synchronize_rcu(); /* guarantee not available in head->list */
@@ -3836,7 +3836,15 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	list_del_init(&ns->list);
 	up_write(&ns->ctrl->namespaces_rwsem);
 
-	nvme_mpath_check_last_path(ns);
+	/* Synchronize with nvme_init_ns_head() */
+	mutex_lock(&ns->head->subsys->lock);
+	if (list_empty(&ns->head->list)) {
+		list_del_init(&ns->head->entry);
+		last_path = true;
+	}
+	mutex_unlock(&ns->head->subsys->lock);
+	if (last_path)
+		nvme_mpath_shutdown_disk(ns->head);
 	nvme_put_ns(ns);
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0ea5298469c3..3f32c5e86bfc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -760,14 +760,21 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 #endif
 }
 
-void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
+	kblockd_schedule_work(&head->requeue_work);
 	if (head->disk->flags & GENHD_FL_UP) {
 		nvme_cdev_del(&head->cdev, &head->cdev_device);
 		del_gendisk(head->disk);
 	}
+}
+
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+	if (!head->disk)
+		return;
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 18ef8dd03a90..5cd1fa3b8464 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -716,14 +716,7 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
-
-static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
-{
-	struct nvme_ns_head *head = ns->head;
-
-	if (head->disk && list_empty(&head->list))
-		kblockd_schedule_work(&head->requeue_work);
-}
+void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
 
 static inline void nvme_trace_bio_complete(struct request *req)
 {
@@ -772,7 +765,7 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 }
-static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
 {
 }
 static inline void nvme_trace_bio_complete(struct request *req)
-- 
cgit v1.2.3


From 234211b8dd161fa25f192c78d5a8d2dd6bf920a0 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Mon, 19 Jul 2021 09:44:39 -0700
Subject: nvme: fix nvme_setup_command metadata trace event

The metadata address is set after the trace event, so the trace is not
capturing anything useful. Rather than logging the memory address, it's
useful to know if the command carries a metadata payload, so change the
trace event to log that true/false state instead.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index daaf700eae79..35bac7a25422 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -56,7 +56,7 @@ TRACE_EVENT(nvme_setup_cmd,
 		__field(u8, fctype)
 		__field(u16, cid)
 		__field(u32, nsid)
-		__field(u64, metadata)
+		__field(bool, metadata)
 		__array(u8, cdw10, 24)
 	    ),
 	    TP_fast_assign(
@@ -66,13 +66,13 @@ TRACE_EVENT(nvme_setup_cmd,
 		__entry->flags = cmd->common.flags;
 		__entry->cid = cmd->common.command_id;
 		__entry->nsid = le32_to_cpu(cmd->common.nsid);
-		__entry->metadata = le64_to_cpu(cmd->common.metadata);
+		__entry->metadata = !!blk_integrity_rq(req);
 		__entry->fctype = cmd->fabrics.fctype;
 		__assign_disk_name(__entry->disk, req->rq_disk);
 		memcpy(__entry->cdw10, &cmd->common.cdw10,
 			sizeof(__entry->cdw10));
 	    ),
-	    TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+	    TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)",
 		      __entry->ctrl_id, __print_disk_name(__entry->disk),
 		      __entry->qid, __entry->cid, __entry->nsid,
 		      __entry->flags, __entry->metadata,
-- 
cgit v1.2.3


From aaeb7bb061be545251606f4d9c82d710ca2a7c8e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 21 Jul 2021 10:00:11 +0200
Subject: nvme: set the PRACT bit when using Write Zeroes with T10 PI

When using Write Zeroes on a namespace that has protection
information enabled they behavior without the PRACT bit
counter-intuitive and will generally lead to validation failures
when reading the written blocks.  Fix this by always setting the
PRACT bit that generates matching PI data on the fly.

Fixes: 6e02318eaea5 ("nvme: add support for the Write Zeroes command")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 17c05a4595f0..dfd9dec0c1f6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -900,7 +900,10 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
 	cmnd->write_zeroes.length =
 		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
-	cmnd->write_zeroes.control = 0;
+	if (nvme_ns_has_pi(ns))
+		cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
+	else
+		cmnd->write_zeroes.control = 0;
 	return BLK_STS_OK;
 }
 
-- 
cgit v1.2.3