diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 258 |
1 files changed, 198 insertions, 60 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3c5086673bc..b82492cd7503 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. Use 0 to disable SGLs."); +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = { static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); static int io_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -135,6 +137,7 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; u32 last_ps; + bool hmb; mempool_t *iod_mempool; @@ -153,18 +156,14 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + + bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -1014,7 +1013,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", @@ -1554,6 +1553,28 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) wmb(); /* ensure the first interrupt sees the initialization */ } +/* + * Try getting shutdown_lock while setting up IO queues. + */ +static int nvme_setup_io_queues_trylock(struct nvme_dev *dev) +{ + /* + * Give up if the lock is being held by nvme_dev_disable. + */ + if (!mutex_trylock(&dev->shutdown_lock)) + return -ENODEV; + + /* + * Controller is in wrong state, fail early. + */ + if (dev->ctrl.state != NVME_CTRL_CONNECTING) { + mutex_unlock(&dev->shutdown_lock); + return -ENODEV; + } + + return 0; +} + static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) { struct nvme_dev *dev = nvmeq->dev; @@ -1582,8 +1603,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) goto release_cq; nvmeq->cq_vector = vector; - nvme_init_queue(nvmeq, qid); + result = nvme_setup_io_queues_trylock(dev); + if (result) + return result; + nvme_init_queue(nvmeq, qid); if (!polled) { result = queue_request_irq(nvmeq); if (result < 0) @@ -1591,10 +1615,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) } set_bit(NVMEQ_ENABLED, &nvmeq->flags); + mutex_unlock(&dev->shutdown_lock); return result; release_sq: dev->online_queues--; + mutex_unlock(&dev->shutdown_lock); adapter_delete_sq(dev, qid); release_cq: adapter_delete_cq(dev, qid); @@ -1781,17 +1807,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev) return ret >= 0 ? 0 : ret; } -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; @@ -1860,20 +1875,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -1896,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) dev_warn(dev->ctrl.device, "failed to set host mem (err %d, flags %#x).\n", ret, bits); - } + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + return ret; } @@ -2053,6 +2056,102 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2167,7 +2266,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - clear_bit(NVMEQ_ENABLED, &adminq->flags); + /* + * Free IRQ resources as soon as NVMEQ_ENABLED bit transitions + * from set to unset. If there is a window to it is truely freed, + * pci_free_irq_vectors() jumping into this window will crash. + * And take lock to avoid racing with pci_free_irq_vectors() in + * nvme_dev_disable() path. + */ + result = nvme_setup_io_queues_trylock(dev); + if (result) + return result; + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) + pci_free_irq(pdev, 0, adminq); if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -2183,14 +2293,17 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) result = nvme_remap_bar(dev, size); if (!result) break; - if (!--nr_io_queues) - return -ENOMEM; + if (!--nr_io_queues) { + result = -ENOMEM; + goto out_unlock; + } } while (1); adminq->q_db = dev->dbs; retry: /* Deregister the admin queue's interrupt */ - pci_free_irq(pdev, 0, adminq); + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) + pci_free_irq(pdev, 0, adminq); /* * If we enable msix early due to not intx, disable it again before @@ -2199,8 +2312,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) pci_free_irq_vectors(pdev); result = nvme_setup_irqs(dev, nr_io_queues); - if (result <= 0) - return -EIO; + if (result <= 0) { + result = -EIO; + goto out_unlock; + } dev->num_vecs = result; result = max(result - 1, 1); @@ -2214,8 +2329,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) */ result = queue_request_irq(adminq); if (result) - return result; + goto out_unlock; set_bit(NVMEQ_ENABLED, &adminq->flags); + mutex_unlock(&dev->shutdown_lock); result = nvme_create_io_queues(dev); if (result || dev->online_queues < 2) @@ -2224,6 +2340,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->online_queues - 1 < dev->max_qid) { nr_io_queues = dev->online_queues - 1; nvme_disable_io_queues(dev); + result = nvme_setup_io_queues_trylock(dev); + if (result) + return result; nvme_suspend_io_queues(dev); goto retry; } @@ -2232,6 +2351,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->io_queues[HCTX_TYPE_READ], dev->io_queues[HCTX_TYPE_POLL]); return 0; +out_unlock: + mutex_unlock(&dev->shutdown_lock); + return result; } static void nvme_del_queue_end(struct request *req, blk_status_t error) @@ -2581,7 +2703,9 @@ static void nvme_reset_work(struct work_struct *work) bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result; - if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) { + if (dev->ctrl.state != NVME_CTRL_RESETTING) { + dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", + dev->ctrl.state); result = -ENODEV; goto out; } @@ -2699,6 +2823,10 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + nvme_start_ctrl(&dev->ctrl); return; @@ -2947,6 +3075,13 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -2962,14 +3097,13 @@ static void nvme_remove(struct pci_dev *pdev) if (!pci_device_is_present(pdev)) { nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); nvme_dev_disable(dev, true); - nvme_dev_remove_admin(dev); } flush_work(&dev->ctrl.reset_work); nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_release_cmb(dev); + nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); @@ -2996,8 +3130,13 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + return 0; +reset: + return nvme_try_sched_reset(ctrl); } static int nvme_suspend(struct device *dev) @@ -3021,15 +3160,9 @@ static int nvme_suspend(struct device *dev) * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. */ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true); @@ -3040,6 +3173,17 @@ static int nvme_suspend(struct device *dev) if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze; @@ -3192,12 +3336,6 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ |