From a7e15f57766cf5e42b630612a144f15bf571a05a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 Jul 2020 17:32:09 -0700 Subject: nvme-rdma: fix controller reset hang during traffic [ Upstream commit 9f98772ba307dd89a3d17dc2589f213d3972fc64 ] commit fe35ec58f0d3 ("block: update hctx map when use multiple maps") exposed an issue where we may hang trying to wait for queue freeze during I/O. We call blk_mq_update_nr_hw_queues which in case of multiple queue maps (which we have now for default/read/poll) is attempting to freeze the queue. However we never started queue freeze when starting the reset, which means that we have inflight pending requests that entered the queue that we will not complete once the queue is quiesced. So start a freeze before we quiesce the queue, and unfreeze the queue after we successfully connected the I/O queues (and make sure to call blk_mq_update_nr_hw_queues only after we are sure that the queue was already frozen). This follows to how the pci driver handles resets. Fixes: fe35ec58f0d3 ("block: update hctx map when use multiple maps") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/rdma.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/nvme/host/rdma.c') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 13506a87a444..af0cfd25ed7a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -941,15 +941,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(&ctrl->ctrl); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + return 0; out_cleanup_connect_q: @@ -982,6 +987,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { -- cgit v1.2.3 From b34732c06f4b4b1d75ada5bd77cc943d1cb77afa Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 22 Jul 2020 16:32:19 -0700 Subject: nvme: fix deadlock in disconnect during scan_work and/or ana_work [ Upstream commit ecca390e80561debbfdb4dc96bf94595136889fa ] A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/core.c | 15 +++++++++++++++ drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/fabrics.h | 3 ++- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/multipath.c | 18 +++++++++++++++--- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/rdma.c | 10 ++++++---- drivers/nvme/host/tcp.c | 15 +++++++++------ 8 files changed, 50 insertions(+), 15 deletions(-) (limited to 'drivers/nvme/host/rdma.c') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4ee2330c603e..f38548e6d55e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -362,6 +362,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_DEAD: switch (old_state) { case NVME_CTRL_DELETING: @@ -399,6 +409,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl) case NVME_CTRL_CONNECTING: return false; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: case NVME_CTRL_DEAD: return true; default: @@ -3344,6 +3355,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", [NVME_CTRL_DEAD] = "dead", }; @@ -3911,6 +3923,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) if (ctrl->state == NVME_CTRL_DEAD) nvme_kill_queues(ctrl); + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + down_write(&ctrl->namespaces_rwsem); list_splice_init(&ctrl->namespaces, &ns_list); up_write(&ctrl->namespaces_rwsem); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2a6c8190eeb7..4ec4829d6233 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq) { - if (ctrl->state != NVME_CTRL_DELETING && + if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a0ec40ab62ee..a9c1e3b4585e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_DELETING)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e999a8c4b7e8..549f5b0fb0b4 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -825,6 +825,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) break; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: default: /* no action to take - let it delete */ break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 57d51148e71b..267295323343 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static bool nvme_path_is_disabled(struct nvme_ns *ns) { - return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags); + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. + */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags)) + return true; + return false; } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) @@ -574,6 +583,9 @@ static void nvme_ana_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + if (ctrl->state != NVME_CTRL_LIVE) + return; + nvme_read_ana_log(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 09ffc3246f60..e268f1d7e1a0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -186,6 +186,7 @@ enum nvme_ctrl_state { NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, NVME_CTRL_DEAD, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index af0cfd25ed7a..876859cd14e8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1082,11 +1082,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1139,8 +1140,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); return; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 83bb329d4113..a6d2e3330a58 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1929,11 +1929,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1989,8 +1990,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2025,8 +2027,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work) nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } -- cgit v1.2.3