Diffstat (limited to 'drivers/nvme/host')

 drivers/nvme/host/core.c |  86
 drivers/nvme/host/fc.c   | 175
 drivers/nvme/host/nvme.h |   4
 drivers/nvme/host/pci.c  |  38
 drivers/nvme/host/rdma.c |  64
 5 files changed, 213 insertions(+), 154 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d5e0906262ea..903d5813023a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
-static unsigned long default_ps_max_latency_us = 25000;
+static unsigned long default_ps_max_latency_us = 100000;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
 		 "max power saving latency for new devices; use PM QOS to change per device");
@@ -925,6 +925,29 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+		u16 bs)
+{
+	struct nvme_ns *ns = disk->private_data;
+	u16 old_ms = ns->ms;
+	u8 pi_type = 0;
+
+	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+	/* PI implementation requires metadata equal t10 pi tuple size */
+	if (ns->ms == sizeof(struct t10_pi_tuple))
+		pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+
+	if (blk_get_integrity(disk) &&
+	    (ns->pi_type != pi_type || ns->ms != old_ms ||
+	     bs != queue_logical_block_size(disk->queue) ||
+	     (ns->ms && ns->ext)))
+		blk_integrity_unregister(disk);
+
+	ns->pi_type = pi_type;
+}
+
 static void nvme_init_integrity(struct nvme_ns *ns)
 {
 	struct blk_integrity integrity;
@@ -951,6 +974,10 @@ static void nvme_init_integrity(struct nvme_ns *ns)
 	blk_queue_max_integrity_segments(ns->queue, 1);
 }
 #else
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+		u16 bs)
+{
+}
 static void nvme_init_integrity(struct nvme_ns *ns)
 {
 }
@@ -997,37 +1024,22 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
 	struct nvme_ns *ns = disk->private_data;
-	u8 lbaf, pi_type;
-	u16 old_ms;
-	unsigned short bs;
-
-	old_ms = ns->ms;
-	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-	ns->lba_shift = id->lbaf[lbaf].ds;
-	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+	u16 bs;
 
 	/*
	 * If identify namespace failed, use default 512 byte block size so
	 * block layer can use before failing read/write for 0 capacity.
	 */
+	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
 	if (ns->lba_shift == 0)
 		ns->lba_shift = 9;
 	bs = 1 << ns->lba_shift;
-	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
-	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
-			id->dps & NVME_NS_DPS_PI_MASK : 0;
 
 	blk_mq_freeze_queue(disk->queue);
-	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
-				ns->ms != old_ms ||
-				bs != queue_logical_block_size(disk->queue) ||
-				(ns->ms && ns->ext)))
-		blk_integrity_unregister(disk);
-
-	ns->pi_type = pi_type;
+
+	if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+		nvme_prep_integrity(disk, id, bs);
 	blk_queue_logical_block_size(ns->queue, bs);
-
 	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
 		nvme_init_integrity(ns);
 	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
@@ -1330,7 +1342,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
	 * transitioning between power states.  Therefore, when running
	 * in any given state, we will enter the next lower-power
	 * non-operational state after waiting 50 * (enlat + exlat)
-	 * microseconds, as long as that state's total latency is under
+	 * microseconds, as long as that state's exit latency is under
	 * the requested maximum latency.
	 *
	 * We will not autonomously enter any non-operational state for
@@ -1375,7 +1387,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
		 * lowest-power state, not the number of states.
		 */
 		for (state = (int)ctrl->npss; state >= 0; state--) {
-			u64 total_latency_us, transition_ms;
+			u64 total_latency_us, exit_latency_us, transition_ms;
 
 			if (target)
 				table->entries[state] = target;
@@ -1396,12 +1408,15 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
			    NVME_PS_FLAGS_NON_OP_STATE))
 				continue;
 
-			total_latency_us =
-				(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
-				+ le32_to_cpu(ctrl->psd[state].exit_lat);
-			if (total_latency_us > ctrl->ps_max_latency_us)
+			exit_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (exit_latency_us > ctrl->ps_max_latency_us)
 				continue;
 
+			total_latency_us =
+				exit_latency_us +
+				le32_to_cpu(ctrl->psd[state].entry_lat);
+
			/*
			 * This state is good.  Use it as the APST idle
			 * target for higher power states.
@@ -1605,7 +1620,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	}
 
 	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
-	if (ctrl->ops->is_fabrics) {
+	if (ctrl->ops->flags & NVME_F_FABRICS) {
 		ctrl->icdoff = le16_to_cpu(id->icdoff);
 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
 		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
@@ -2098,7 +2113,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		if (ns->ndev)
 			nvme_nvm_unregister_sysfs(ns);
 		del_gendisk(ns->disk);
-		blk_mq_abort_requeue_list(ns->queue);
 		blk_cleanup_queue(ns->queue);
 	}
 
@@ -2427,6 +2441,10 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns;
 
 	mutex_lock(&ctrl->namespaces_mutex);
+
+	/* Forcibly start all queues to avoid having stuck requests */
+	blk_mq_start_hw_queues(ctrl->admin_q);
+
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
		/*
		 * Revalidating a dead namespace sets capacity to 0. This will
@@ -2436,8 +2454,16 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 			continue;
 		revalidate_disk(ns->disk);
 		blk_set_queue_dying(ns->queue);
-		blk_mq_abort_requeue_list(ns->queue);
-		blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+		/*
+		 * Forcibly start all queues to avoid having stuck requests.
+		 * Note that we must ensure the queues are not stopped
+		 * when the final removal happens.
+		 */
+		blk_mq_start_hw_queues(ns->queue);
+
+		/* draining requests in requeue list */
+		blk_mq_kick_requeue_list(ns->queue);
 	}
 	mutex_unlock(&ctrl->namespaces_mutex);
 }
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 70e689bf1cad..92964cef0f4b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -45,8 +45,6 @@ enum nvme_fc_queue_flags {
 
 #define NVMEFC_QUEUE_DELAY	3		/* ms units */
 
-#define NVME_FC_MAX_CONNECT_ATTEMPTS	1
-
 struct nvme_fc_queue {
 	struct nvme_fc_ctrl	*ctrl;
 	struct device		*dev;
@@ -165,8 +163,6 @@ struct nvme_fc_ctrl {
 	struct work_struct	delete_work;
 	struct work_struct	reset_work;
 	struct delayed_work	connect_work;
-	int			reconnect_delay;
-	int			connect_attempts;
 
 	struct kref		ref;
 	u32			flags;
@@ -1143,6 +1139,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
 /* *********************** NVME Ctrl Routines **************************** */
 
 static void __nvme_fc_final_op_cleanup(struct request *rq);
+static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
 
 static int
 nvme_fc_reinit_request(void *data, struct request *rq)
@@ -1269,7 +1266,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	struct nvme_command *sqe = &op->cmd_iu.sqe;
 	__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
 	union nvme_result result;
-	bool complete_rq;
+	bool complete_rq, terminate_assoc = true;
 
	/*
	 * WARNING:
@@ -1298,6 +1295,14 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
	 * fabricate a CQE, the following fields will not be set as they
	 * are not referenced:
	 *     cqe.sqid,  cqe.sqhd,  cqe.command_id
+	 *
+	 * Failure or error of an individual i/o, in a transport
+	 * detected fashion unrelated to the nvme completion status,
+	 * potentially cause the initiator and target sides to get out
+	 * of sync on SQ head/tail (aka outstanding io count allowed).
+	 * Per FC-NVME spec, failure of an individual command requires
+	 * the connection to be terminated, which in turn requires the
+	 * association to be terminated.
	 */
 
 	fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
@@ -1363,6 +1368,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 		goto done;
 	}
 
+	terminate_assoc = false;
+
 done:
 	if (op->flags & FCOP_FLAGS_AEN) {
 		nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
@@ -1370,19 +1377,23 @@ done:
 		atomic_set(&op->state, FCPOP_STATE_IDLE);
 		op->flags = FCOP_FLAGS_AEN;	/* clear other flags */
 		nvme_fc_ctrl_put(ctrl);
-		return;
+		goto check_error;
 	}
 
 	complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
 	if (!complete_rq) {
 		if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
-			status = cpu_to_le16(NVME_SC_ABORT_REQ);
+			status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
 			if (blk_queue_dying(rq->q))
-				status |= cpu_to_le16(NVME_SC_DNR);
+				status |= cpu_to_le16(NVME_SC_DNR << 1);
 		}
 		nvme_end_request(rq, status, result);
 	} else
 		__nvme_fc_final_op_cleanup(rq);
+
+check_error:
+	if (terminate_assoc)
+		nvme_fc_error_recovery(ctrl, "transport detected io error");
 }
 
 static int
@@ -1751,9 +1762,13 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
 	dev_warn(ctrl->ctrl.device,
 		"NVME-FC{%d}: transport association error detected: %s\n",
 		ctrl->cnum, errmsg);
-	dev_info(ctrl->ctrl.device,
+	dev_warn(ctrl->ctrl.device,
 		"NVME-FC{%d}: resetting controller\n", ctrl->cnum);
 
+	/* stop the queues on error, cleanup is in reset thread */
+	if (ctrl->queue_count > 1)
+		nvme_stop_queues(&ctrl->ctrl);
+
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: error_recovery: Couldn't change state "
@@ -2191,9 +2206,6 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 	if (!opts->nr_io_queues)
 		return 0;
 
-	dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
-			opts->nr_io_queues);
-
 	nvme_fc_init_io_queues(ctrl);
 
 	memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
@@ -2264,9 +2276,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl)
 	if (ctrl->queue_count == 1)
 		return 0;
 
-	dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n",
-			opts->nr_io_queues);
-
 	nvme_fc_init_io_queues(ctrl);
 
 	ret = blk_mq_reinit_tagset(&ctrl->tag_set);
@@ -2302,7 +2311,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
 	int ret;
 	bool changed;
 
-	ctrl->connect_attempts++;
+	++ctrl->ctrl.opts->nr_reconnects;
 
	/*
	 * Create the admin queue
@@ -2399,9 +2408,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
 	WARN_ON_ONCE(!changed);
 
-	ctrl->connect_attempts = 0;
-
-	kref_get(&ctrl->ctrl.kref);
+	ctrl->ctrl.opts->nr_reconnects = 0;
 
 	if (ctrl->queue_count > 1) {
 		nvme_start_queues(&ctrl->ctrl);
@@ -2532,26 +2539,32 @@ nvme_fc_delete_ctrl_work(struct work_struct *work)
 
	/*
	 * tear down the controller
-	 * This will result in the last reference on the nvme ctrl to
-	 * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback.
-	 * From there, the transport will tear down it's logical queues and
-	 * association.
+	 * After the last reference on the nvme ctrl is removed,
+	 * the transport nvme_fc_nvme_ctrl_freed() callback will be
+	 * invoked. From there, the transport will tear down it's
+	 * logical queues and association.
	 */
 
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
 }
 
-static int
-__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
+static bool
+__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl)
 {
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
-		return -EBUSY;
+		return true;
 
 	if (!queue_work(nvme_fc_wq, &ctrl->delete_work))
-		return -EBUSY;
+		return true;
 
-	return 0;
+	return false;
+}
+
+static int
+__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0;
 }
 
 /*
@@ -2577,6 +2590,35 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
 }
 
 static void
+nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
+{
+	/* If we are resetting/deleting then do nothing */
+	if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
+		WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
+			ctrl->ctrl.state == NVME_CTRL_LIVE);
+		return;
+	}
+
+	dev_info(ctrl->ctrl.device,
+		"NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
+		ctrl->cnum, status);
+
+	if (nvmf_should_reconnect(&ctrl->ctrl)) {
+		dev_info(ctrl->ctrl.device,
+			"NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
+			ctrl->cnum, ctrl->ctrl.opts->reconnect_delay);
+		queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
+				ctrl->ctrl.opts->reconnect_delay * HZ);
+	} else {
+		dev_warn(ctrl->ctrl.device,
+				"NVME-FC{%d}: Max reconnect attempts (%d) "
+				"reached. Removing controller\n",
+				ctrl->cnum, ctrl->ctrl.opts->nr_reconnects);
+		WARN_ON(__nvme_fc_schedule_delete_work(ctrl));
+	}
+}
+
+static void
 nvme_fc_reset_ctrl_work(struct work_struct *work)
 {
 	struct nvme_fc_ctrl *ctrl =
@@ -2587,34 +2629,9 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
 	nvme_fc_delete_association(ctrl);
 
 	ret = nvme_fc_create_association(ctrl);
-	if (ret) {
-		dev_warn(ctrl->ctrl.device,
-			"NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
-			ctrl->cnum, ret);
-		if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
-			dev_warn(ctrl->ctrl.device,
-				"NVME-FC{%d}: Max reconnect attempts (%d) "
-				"reached. Removing controller\n",
-				ctrl->cnum, ctrl->connect_attempts);
-
-			if (!nvme_change_ctrl_state(&ctrl->ctrl,
-				NVME_CTRL_DELETING)) {
-				dev_err(ctrl->ctrl.device,
-					"NVME-FC{%d}: failed to change state "
-					"to DELETING\n", ctrl->cnum);
-				return;
-			}
-
-			WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
-			return;
-		}
-
-		dev_warn(ctrl->ctrl.device,
-			"NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
-			ctrl->cnum, ctrl->reconnect_delay);
-		queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
-				ctrl->reconnect_delay * HZ);
-	} else
+	if (ret)
+		nvme_fc_reconnect_or_delete(ctrl, ret);
+	else
 		dev_info(ctrl->ctrl.device,
 			"NVME-FC{%d}: controller reset complete\n", ctrl->cnum);
 }
@@ -2628,7 +2645,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 
-	dev_warn(ctrl->ctrl.device,
+	dev_info(ctrl->ctrl.device,
 		"NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum);
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
@@ -2645,7 +2662,7 @@ nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
 static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
 	.name			= "fc",
 	.module			= THIS_MODULE,
-	.is_fabrics		= true,
+	.flags			= NVME_F_FABRICS,
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,
@@ -2667,34 +2684,9 @@ nvme_fc_connect_ctrl_work(struct work_struct *work)
 			struct nvme_fc_ctrl, connect_work);
 
 	ret = nvme_fc_create_association(ctrl);
-	if (ret) {
-		dev_warn(ctrl->ctrl.device,
-			"NVME-FC{%d}: Reconnect attempt failed (%d)\n",
-			ctrl->cnum, ret);
-		if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
-			dev_warn(ctrl->ctrl.device,
-				"NVME-FC{%d}: Max reconnect attempts (%d) "
-				"reached. Removing controller\n",
-				ctrl->cnum, ctrl->connect_attempts);
-
-			if (!nvme_change_ctrl_state(&ctrl->ctrl,
-				NVME_CTRL_DELETING)) {
-				dev_err(ctrl->ctrl.device,
-					"NVME-FC{%d}: failed to change state "
-					"to DELETING\n", ctrl->cnum);
-				return;
-			}
-
-			WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
-			return;
-		}
-
-		dev_warn(ctrl->ctrl.device,
-			"NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
-			ctrl->cnum, ctrl->reconnect_delay);
-		queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
-				ctrl->reconnect_delay * HZ);
-	} else
+	if (ret)
+		nvme_fc_reconnect_or_delete(ctrl, ret);
+	else
 		dev_info(ctrl->ctrl.device,
 			"NVME-FC{%d}: controller reconnect complete\n",
 			ctrl->cnum);
@@ -2720,6 +2712,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	unsigned long flags;
 	int ret, idx;
 
+	if (!(rport->remoteport.port_role &
+	    (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
+		ret = -EBADR;
+		goto out_fail;
+	}
+
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
 	if (!ctrl) {
 		ret = -ENOMEM;
@@ -2745,7 +2743,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
 	INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work);
 	INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
-	ctrl->reconnect_delay = opts->reconnect_delay;
 	spin_lock_init(&ctrl->lock);
 
	/* io queue count */
@@ -2825,6 +2822,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 		return ERR_PTR(ret);
 	}
 
+	kref_get(&ctrl->ctrl.kref);
+
 	dev_info(ctrl->ctrl.device,
 		"NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
 		ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
@@ -2961,7 +2960,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 static struct nvmf_transport_ops nvme_fc_transport = {
 	.name		= "fc",
 	.required_opts	= NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
-	.allowed_opts	= NVMF_OPT_RECONNECT_DELAY,
+	.allowed_opts	= NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
 	.create_ctrl	= nvme_fc_create_ctrl,
 };
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 29c708ca9621..9d6a070d4391 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -208,7 +208,9 @@ struct nvme_ns {
 struct nvme_ctrl_ops {
 	const char *name;
 	struct module *module;
-	bool is_fabrics;
+	unsigned int flags;
+#define NVME_F_FABRICS			(1 << 0)
+#define NVME_F_METADATA_SUPPORTED	(1 << 1)
 	int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
 	int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
 	int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fed803232edc..40c7581caeb0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -263,7 +263,7 @@ static void nvme_dbbuf_set(struct nvme_dev *dev)
 	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
 
 	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
-		dev_warn(dev->dev, "unable to set dbbuf\n");
+		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
 		nvme_dbbuf_dma_free(dev);
 	}
@@ -1367,7 +1367,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
 	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
 
	/* If there is a reset ongoing, we shouldn't reset again. */
-	if (work_busy(&dev->reset_work))
+	if (dev->ctrl.state == NVME_CTRL_RESETTING)
 		return false;
 
	/* We shouldn't reset unless the controller is on fatal error state
@@ -1394,11 +1394,11 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
 				      &pci_status);
 	if (result == PCIBIOS_SUCCESSFUL)
-		dev_warn(dev->dev,
+		dev_warn(dev->ctrl.device,
 			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
 			 csts, pci_status);
 	else
-		dev_warn(dev->dev,
+		dev_warn(dev->ctrl.device,
 			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
 			 csts, result);
 }
@@ -1506,6 +1506,11 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
 	if (dev->cmb) {
 		iounmap(dev->cmb);
 		dev->cmb = NULL;
+		if (dev->cmbsz) {
+			sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
+						     &dev_attr_cmb.attr, NULL);
+			dev->cmbsz = 0;
+		}
 	}
 }
@@ -1735,8 +1740,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
	 */
 	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
 		dev->q_depth = 2;
-		dev_warn(dev->dev, "detected Apple NVMe controller, set "
-			"queue depth=%u to work around controller resets\n",
+		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
+			"set queue depth=%u to work around controller resets\n",
 			dev->q_depth);
 	}
@@ -1754,7 +1759,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 		if (dev->cmbsz) {
 			if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
 						    &dev_attr_cmb.attr, NULL))
-				dev_warn(dev->dev,
+				dev_warn(dev->ctrl.device,
 					 "failed to add sysfs attribute for CMB\n");
 		}
 	}
@@ -1779,6 +1784,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+	nvme_release_cmb(dev);
 	pci_free_irq_vectors(pdev);
 
 	if (pci_is_enabled(pdev)) {
@@ -1799,7 +1805,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE)
+		if (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING)
 			nvme_start_freeze(&dev->ctrl);
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pdev->error_state != pci_channel_io_normal);
@@ -1897,7 +1904,7 @@ static void nvme_reset_work(struct work_struct *work)
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result = -ENODEV;
 
-	if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
+	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
 		goto out;
 
	/*
@@ -1907,9 +1914,6 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
-		goto out;
-
 	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
@@ -2003,8 +2007,8 @@ static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
 		return -ENODEV;
-	if (work_busy(&dev->reset_work))
-		return -ENODEV;
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+		return -EBUSY;
 	if (!queue_work(nvme_workq, &dev->reset_work))
 		return -EBUSY;
 	return 0;
@@ -2041,6 +2045,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.name			= "pcie",
 	.module			= THIS_MODULE,
+	.flags			= NVME_F_METADATA_SUPPORTED,
 	.reg_read32		= nvme_pci_reg_read32,
 	.reg_write32		= nvme_pci_reg_write32,
 	.reg_read64		= nvme_pci_reg_read64,
@@ -2129,6 +2134,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	queue_work(nvme_workq, &dev->reset_work);
@@ -2172,6 +2178,7 @@ static void nvme_remove(struct pci_dev *pdev)
 
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 
+	cancel_work_sync(&dev->reset_work);
 	pci_set_drvdata(pdev, NULL);
 
 	if (!pci_device_is_present(pdev)) {
@@ -2184,7 +2191,6 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_disable(dev, true);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
-	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
 	nvme_dev_unmap(dev);
 	nvme_put_ctrl(&dev->ctrl);
@@ -2288,6 +2294,8 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_VDEVICE(INTEL, 0x0a54),
 		.driver_data = NVME_QUIRK_STRIPE_SIZE |
 				NVME_QUIRK_DEALLOCATE_ZEROES, },
+	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
 	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index dd1c6deef82f..24397d306d53 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	if (ret)
 		goto requeue;
 
-	blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
-
 	ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 	if (ret)
-		goto stop_admin_q;
+		goto requeue;
 
 	set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 
 	ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
 	if (ret)
-		goto stop_admin_q;
+		goto requeue;
 
 	nvme_start_keep_alive(&ctrl->ctrl);
 
 	if (ctrl->queue_count > 1) {
 		ret = nvme_rdma_init_io_queues(ctrl);
 		if (ret)
-			goto stop_admin_q;
+			goto requeue;
 
 		ret = nvme_rdma_connect_io_queues(ctrl);
 		if (ret)
-			goto stop_admin_q;
+			goto requeue;
 	}
 
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
@@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	ctrl->ctrl.opts->nr_reconnects = 0;
 
 	if (ctrl->queue_count > 1) {
-		nvme_start_queues(&ctrl->ctrl);
 		nvme_queue_scan(&ctrl->ctrl);
 		nvme_queue_async_events(&ctrl->ctrl);
 	}
@@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
 	return;
 
-stop_admin_q:
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
 			ctrl->ctrl.opts->nr_reconnects);
@@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 
+	/*
+	 * queues are not a live anymore, so restart the queues to fail fast
+	 * new IO
+	 */
+	blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+	nvme_start_queues(&ctrl->ctrl);
+
 	nvme_rdma_reconnect_or_remove(ctrl);
 }
@@ -1038,6 +1040,19 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 		nvme_rdma_wr_error(cq, wc, "SEND");
 }
 
+static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+{
+	int sig_limit;
+
+	/*
+	 * We signal completion every queue depth/2 and also handle the
+	 * degenerated case of a device with queue_depth=1, where we
+	 * would need to signal every message.
+	 */
+	sig_limit = max(queue->queue_size / 2, 1);
+	return (++queue->sig_count % sig_limit) == 0;
+}
+
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
 		struct ib_send_wr *first, bool flush)
@@ -1065,9 +1080,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
	 * Would have been way to obvious to handle this in hardware or
	 * at least the RDMA stack..
	 *
-	 * This messy and racy code sniplet is copy and pasted from the iSER
-	 * initiator, and the magic '32' comes from there as well.
-	 *
	 * Always signal the flushes. The magic request used for the flush
	 * sequencer is not allocated in our driver's tagset and it's
	 * triggered to be freed by blk_cleanup_queue(). So we need to
@@ -1075,7 +1087,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
	 * embedded in request's payload, is not freed when __ib_process_cq()
	 * calls wr_cqe->done().
	 */
-	if ((++queue->sig_count % 32) == 0 || flush)
+	if (nvme_rdma_queue_sig_limit(queue) || flush)
 		wr.send_flags |= IB_SEND_SIGNALED;
 
 	if (first)
@@ -1423,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 /*
  * We cannot accept any other command until the Connect command has completed.
  */
-static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
+static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 		struct request *rq)
 {
 	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
@@ -1431,11 +1443,22 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 
 		if (!blk_rq_is_passthrough(rq) ||
 		    cmd->common.opcode != nvme_fabrics_command ||
-		    cmd->fabrics.fctype != nvme_fabrics_type_connect)
-			return false;
+		    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
+			/*
+			 * reconnecting state means transport disruption, which
+			 * can take a long time and even might fail permanently,
+			 * so we can't let incoming I/O be requeued forever.
+			 * fail it fast to allow upper layers a chance to
+			 * failover.
+			 */
+			if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
+				return -EIO;
+			else
+				return -EAGAIN;
+		}
 	}
 
-	return true;
+	return 0;
 }
 
 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1453,8 +1476,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	WARN_ON_ONCE(rq->tag < 0);
 
-	if (!nvme_rdma_queue_is_ready(queue, rq))
-		return BLK_MQ_RQ_QUEUE_BUSY;
+	ret = nvme_rdma_queue_is_ready(queue, rq);
+	if (unlikely(ret))
+		goto err;
 
 	dev = queue->device->dev;
 	ib_dma_sync_single_for_cpu(dev, sqe->dma,
@@ -1782,7 +1806,7 @@ static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.name			= "rdma",
 	.module			= THIS_MODULE,
-	.is_fabrics		= true,
+	.flags			= NVME_F_FABRICS,
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,