diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/Kconfig | 11 | ||||
-rw-r--r-- | drivers/nvme/host/Makefile | 10 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 278 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 75 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 26 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 411 |
6 files changed, 495 insertions, 316 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 5d6237391dcd..c894841c6456 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -1,6 +1,10 @@ +config NVME_CORE + tristate + config BLK_DEV_NVME tristate "NVM Express block device" depends on PCI && BLOCK + select NVME_CORE ---help--- The NVM Express driver is for solid state drives directly connected to the PCI or PCI Express bus. If you know you @@ -11,11 +15,12 @@ config BLK_DEV_NVME config BLK_DEV_NVME_SCSI bool "SCSI emulation for NVMe device nodes" - depends on BLK_DEV_NVME + depends on NVME_CORE ---help--- This adds support for the SG_IO ioctl on the NVMe character and block devices nodes, as well a a translation for a small number of selected SCSI commands to NVMe commands to the NVMe driver. If you don't know what this means you probably want - to say N here, and if you know what it means you probably - want to say N as well. + to say N here, unless you run a distro that abuses the SCSI + emulation to provide stable device names for mount by id, like + some OpenSuSE and SLES versions. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 51bf90871549..9a3ca892b4a7 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,6 +1,8 @@ +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +nvme-core-y := core.o +nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o +nvme-core-$(CONFIG_NVM) += lightnvm.o -lightnvm-$(CONFIG_NVM) := lightnvm.o -nvme-y += core.o pci.o $(lightnvm-y) -nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o +nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c5bf001af559..643f457131c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -33,6 +33,20 @@ #define NVME_MINORS (1U << MINORBITS) +unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); +EXPORT_SYMBOL_GPL(admin_timeout); + +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +EXPORT_SYMBOL_GPL(nvme_io_timeout); + +unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -40,7 +54,7 @@ static int nvme_char_major; module_param(nvme_char_major, int, 0); static LIST_HEAD(nvme_ctrl_list); -DEFINE_SPINLOCK(dev_list_lock); +static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; @@ -55,8 +69,9 @@ static void nvme_free_ns(struct kref *kref) ns->disk->private_data = NULL; spin_unlock(&dev_list_lock); - nvme_put_ctrl(ns->ctrl); put_disk(ns->disk); + ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -71,11 +86,21 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) spin_lock(&dev_list_lock); ns = disk->private_data; - if (ns && !kref_get_unless_zero(&ns->kref)) - ns = NULL; + if (ns) { + if (!kref_get_unless_zero(&ns->kref)) + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + } spin_unlock(&dev_list_lock); return ns; + +fail_put_ns: + kref_put(&ns->kref, nvme_free_ns); +fail: + spin_unlock(&dev_list_lock); + return NULL; } void nvme_requeue_req(struct request *req) @@ -88,6 +113,7 @@ void nvme_requeue_req(struct request *req) blk_mq_kick_requeue_list(req->q); spin_unlock_irqrestore(req->q->queue_lock, flags); } +EXPORT_SYMBOL_GPL(nvme_requeue_req); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags) @@ -107,17 +133,18 @@ struct request *nvme_alloc_request(struct request_queue *q, req->cmd = (unsigned char *)cmd; req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; return req; } +EXPORT_SYMBOL_GPL(nvme_alloc_request); /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen, u32 *result, unsigned timeout) + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout) { struct request *req; int ret; @@ -127,6 +154,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return PTR_ERR(req); req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + req->special = cqe; if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); @@ -135,8 +163,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } blk_execute_rq(req->q, NULL, req, 0); - if (result) - *result = (u32)(uintptr_t)req->special; ret = req->errors; out: blk_mq_free_request(req); @@ -146,8 +172,9 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { - return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); } +EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, @@ -155,6 +182,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, u32 *result, unsigned timeout) { bool write = cmd->common.opcode & 1; + struct nvme_completion cqe; struct nvme_ns *ns = q->queuedata; struct gendisk *disk = ns ? ns->disk : NULL; struct request *req; @@ -167,6 +195,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, return PTR_ERR(req); req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + req->special = &cqe; if (ubuffer && bufflen) { ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, @@ -183,7 +212,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, goto out_unmap; } - if (meta_buffer) { + if (meta_buffer && meta_len) { struct bio_integrity_payload *bip; meta = kmalloc(meta_len, GFP_KERNEL); @@ -221,7 +250,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, blk_execute_rq(req->q, disk, req, 0); ret = req->errors; if (result) - *result = (u32)(uintptr_t)req->special; + *result = le32_to_cpu(cqe.result); if (meta && !ret && !write) { if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; @@ -302,6 +331,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; + struct nvme_completion cqe; + int ret; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; @@ -309,13 +340,18 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + if (ret >= 0) + *result = le32_to_cpu(cqe.result); + return ret; } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; + struct nvme_completion cqe; + int ret; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; @@ -323,7 +359,10 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + if (ret >= 0) + *result = le32_to_cpu(cqe.result); + return ret; } int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) @@ -363,6 +402,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) *count = min(*count, nr_io_queues); return 0; } +EXPORT_SYMBOL_GPL(nvme_set_queue_count); static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { @@ -373,6 +413,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; + if (io.flags) + return -EINVAL; switch (io.opcode) { case nvme_cmd_write: @@ -424,6 +466,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; + if (cmd.flags) + return -EINVAL; memset(&c, 0, sizeof(c)); c.common.opcode = cmd.opcode; @@ -499,7 +543,10 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) static void nvme_release(struct gendisk *disk, fmode_t mode) { - nvme_put_ns(disk->private_data); + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -540,8 +587,14 @@ static void nvme_init_integrity(struct nvme_ns *ns) static void nvme_config_discard(struct nvme_ns *ns) { + struct nvme_ctrl *ctrl = ns->ctrl; u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; + + if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) + ns->queue->limits.discard_zeroes_data = 1; + else + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; ns->queue->limits.discard_granularity = logical_block_size; blk_queue_max_discard_sectors(ns->queue, 0xffffffff); @@ -556,9 +609,13 @@ static int nvme_revalidate_disk(struct gendisk *disk) u16 old_ms; unsigned short bs; + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { - dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", - __func__, ns->ctrl->instance, ns->ns_id); + dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", + __func__); return -ENODEV; } if (id->ncap == 0) { @@ -568,7 +625,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(ns->ctrl->dev, + dev_warn(disk_to_dev(ns->disk), "%s: LightNVM init failure\n", __func__); kfree(id); return -ENODEV; @@ -741,7 +798,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Device not ready; aborting %s\n", enabled ? "initialisation" : "reset"); return -ENODEV; @@ -769,6 +826,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) return ret; return nvme_wait_ready(ctrl, cap, false); } +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) { @@ -781,7 +839,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) int ret; if (page_shift < dev_page_min) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", 1 << dev_page_min, 1 << page_shift); return -ENODEV; @@ -800,6 +858,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) return ret; return nvme_wait_ready(ctrl, cap, true); } +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { @@ -822,7 +881,7 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Device shutdown incomplete; abort shutdown\n"); return -ENODEV; } @@ -830,6 +889,24 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) return ret; } +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(q, ctrl->page_size - 1); +} /* * Initialize the cached copies of the Identify data and various controller @@ -844,13 +921,13 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { - dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret; } ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); if (ret) { - dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); return ret; } page_shift = NVME_CAP_MPSMIN(cap) + 12; @@ -860,13 +937,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_identify_ctrl(ctrl, &id); if (ret) { - dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); return -EIO; } + ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; + ctrl->cntlid = le16_to_cpup(&id->cntlid); memcpy(ctrl->serial, id->sn, sizeof(id->sn)); memcpy(ctrl->model, id->mn, sizeof(id->mn)); memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); @@ -888,9 +967,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } } + nvme_set_queue_limits(ctrl, ctrl->admin_q); + kfree(id); return 0; } +EXPORT_SYMBOL_GPL(nvme_init_identify); static int nvme_dev_open(struct inode *inode, struct file *file) { @@ -937,13 +1019,13 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->dev, + dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); ret = -EINVAL; goto out_unlock; } - dev_warn(ctrl->dev, + dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); kref_get(&ns->kref); mutex_unlock(&ctrl->namespaces_mutex); @@ -969,7 +1051,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: - dev_warn(ctrl->dev, "resetting controller\n"); + dev_warn(ctrl->device, "resetting controller\n"); return ctrl->ops->reset_ctrl(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); @@ -1000,6 +1082,30 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + int serial_len = sizeof(ctrl->serial); + int model_len = sizeof(ctrl->model); + + if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return sprintf(buf, "eui.%16phN\n", ns->uuid); + + if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return sprintf(buf, "eui.%8phN\n", ns->eui); + + while (ctrl->serial[serial_len - 1] == ' ') + serial_len--; + while (ctrl->model[model_len - 1] == ' ') + model_len--; + + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, + serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); +} +static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1025,6 +1131,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); static struct attribute *nvme_ns_attrs[] = { + &dev_attr_wwid.attr, &dev_attr_uuid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, @@ -1053,7 +1160,7 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_attrs_are_visible, }; -#define nvme_show_function(field) \ +#define nvme_show_str_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ @@ -1062,15 +1169,26 @@ static ssize_t field##_show(struct device *dev, \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); -nvme_show_function(model); -nvme_show_function(serial); -nvme_show_function(firmware_rev); +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); +nvme_show_int_function(cntlid); static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_model.attr, &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, NULL }; @@ -1118,10 +1236,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (!ns) return; + ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); + if (ns->instance < 0) + goto out_free_ns; + ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + goto out_release_instance; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; @@ -1135,17 +1256,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (ctrl->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); - } - if (ctrl->stripe_size) - blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + nvme_set_queue_limits(ctrl, ns->queue); disk->major = nvme_major; disk->first_minor = 0; @@ -1154,7 +1267,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->queue = ns->queue; disk->driverfs_dev = ctrl->device; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; @@ -1174,40 +1287,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(disk); out_free_queue: blk_cleanup_queue(ns->queue); + out_release_instance: + ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); - - lockdep_assert_held(&ns->ctrl->namespaces_mutex); - - if (kill) { - blk_set_queue_dying(ns->queue); + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. - */ - blk_mq_abort_requeue_list(ns->queue); - } if (ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group); del_gendisk(ns->disk); - } - if (kill || !blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_mutex); nvme_put_ns(ns); } @@ -1296,16 +1398,16 @@ void nvme_scan_namespaces(struct nvme_ctrl *ctrl) mutex_unlock(&ctrl->namespaces_mutex); kfree(id); } +EXPORT_SYMBOL_GPL(nvme_scan_namespaces); void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; - mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); - mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_remove_namespaces); static DEFINE_IDA(nvme_instance_ida); @@ -1337,13 +1439,14 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) } void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) - { +{ device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); list_del(&ctrl->node); spin_unlock(&dev_list_lock); } +EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); static void nvme_free_ctrl(struct kref *kref) { @@ -1351,6 +1454,7 @@ static void nvme_free_ctrl(struct kref *kref) put_device(ctrl->device); nvme_release_instance(ctrl); + ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } @@ -1359,6 +1463,7 @@ void nvme_put_ctrl(struct nvme_ctrl *ctrl) { kref_put(&ctrl->kref, nvme_free_ctrl); } +EXPORT_SYMBOL_GPL(nvme_put_ctrl); /* * Initialize a NVMe controller structures. This needs to be called during @@ -1383,14 +1488,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), - dev, nvme_dev_attr_groups, + ctrl, nvme_dev_attr_groups, "nvme%d", ctrl->instance); if (IS_ERR(ctrl->device)) { ret = PTR_ERR(ctrl->device); goto out_release_instance; } get_device(ctrl->device); - dev_set_drvdata(ctrl->device, ctrl); + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); list_add_tail(&ctrl->node, &nvme_ctrl_list); @@ -1402,6 +1507,40 @@ out_release_instance: out: return ret; } +EXPORT_SYMBOL_GPL(nvme_init_ctrl); + +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (!kref_get_unless_zero(&ns->kref)) + continue; + + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. + */ + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + revalidate_disk(ns->disk); + + blk_set_queue_dying(ns->queue); + blk_mq_abort_requeue_list(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + + nvme_put_ns(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_kill_queues); void nvme_stop_queues(struct nvme_ctrl *ctrl) { @@ -1418,6 +1557,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_stop_queues); void nvme_start_queues(struct nvme_ctrl *ctrl) { @@ -1431,6 +1571,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_start_queues); int __init nvme_core_init(void) { @@ -1470,3 +1611,8 @@ void nvme_core_exit(void) class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); } + +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_core_init); +module_exit(nvme_core_exit); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 5cd3725e2fa4..9461dd639acd 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,9 +146,18 @@ struct nvme_nvm_command { }; }; +struct nvme_nvm_completion { + __le64 result; /* Used by LightNVM to return ppa completions */ + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_NVM_LP_MLC_PAIRS 886 struct nvme_nvm_lp_mlc { __u16 num_pairs; - __u8 pairs[886]; + __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; }; struct nvme_nvm_lp_tbl { @@ -282,9 +291,14 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) memcpy(dst->lptbl.id, src->lptbl.id, 8); dst->lptbl.mlc.num_pairs = le16_to_cpu(src->lptbl.mlc.num_pairs); - /* 4 bits per pair */ + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; + } + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs >> 1); + dst->lptbl.mlc.num_pairs); } } @@ -373,8 +387,31 @@ out: return ret; } +static void nvme_nvm_bb_tbl_fold(struct nvm_dev *nvmdev, + int nr_dst_blks, u8 *dst_blks, + int nr_src_blks, u8 *src_blks) +{ + int blk, offset, pl, blktype; + + for (blk = 0; blk < nr_dst_blks; blk++) { + offset = blk * nvmdev->plane_mode; + blktype = src_blks[offset]; + + /* Bad blocks on any planes take precedence over other types */ + for (pl = 0; pl < nvmdev->plane_mode; pl++) { + if (src_blks[offset + pl] & + (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { + blktype = src_blks[offset + pl]; + break; + } + } + + dst_blks[blk] = blktype; + } +} + static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - int nr_blocks, nvm_bb_update_fn *update_bbtbl, + int nr_dst_blks, nvm_bb_update_fn *update_bbtbl, void *priv) { struct request_queue *q = nvmdev->q; @@ -382,7 +419,9 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; + u8 *dst_blks = NULL; + int nr_src_blks = nr_dst_blks * nvmdev->plane_mode; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_src_blks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; @@ -393,6 +432,12 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; + dst_blks = kzalloc(nr_dst_blks, GFP_KERNEL); + if (!dst_blks) { + ret = -ENOMEM; + goto out; + } + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { @@ -414,16 +459,21 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { + if (le32_to_cpu(bb_tbl->tblks) != nr_src_blks) { ret = -EINVAL; dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blocks); + le32_to_cpu(bb_tbl->tblks), nr_src_blks); goto out; } + nvme_nvm_bb_tbl_fold(nvmdev, nr_dst_blks, dst_blks, + nr_src_blks, bb_tbl->blk); + ppa = dev_to_generic_addr(nvmdev, ppa); - ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); + ret = update_bbtbl(ppa, nr_dst_blks, dst_blks, priv); + out: + kfree(dst_blks); kfree(bb_tbl); return ret; } @@ -465,6 +515,10 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, static void nvme_nvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; + struct nvme_nvm_completion *cqe = rq->special; + + if (cqe) + rqd->ppa_status = le64_to_cpu(cqe->result); nvm_end_io(rqd, error); @@ -484,7 +538,8 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) if (IS_ERR(rq)) return -ENOMEM; - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + cmd = kzalloc(sizeof(struct nvme_nvm_command) + + sizeof(struct nvme_nvm_completion), GFP_KERNEL); if (!cmd) { blk_mq_free_request(rq); return -ENOMEM; @@ -503,7 +558,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq->cmd = (unsigned char *)cmd; rq->cmd_len = sizeof(struct nvme_nvm_command); - rq->special = (void *)0; + rq->special = cmd + 1; rq->end_io_data = rqd; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4fb5bb737868..f846da4eb338 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -59,6 +59,12 @@ enum nvme_quirks { * correctly. */ NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + + /* + * The controller deterministically returns O's on reads to discarded + * logical blocks. + */ + NVME_QUIRK_DISCARD_ZEROES = (1 << 2), }; struct nvme_ctrl { @@ -72,11 +78,13 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device *device; /* char device */ struct list_head node; + struct ida ns_ida; char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; + int cntlid; u32 ctrl_config; @@ -84,6 +92,7 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u16 vid; atomic_t abort_limit; u8 event_limit; u8 vwc; @@ -102,6 +111,7 @@ struct nvme_ns { struct request_queue *queue; struct gendisk *disk; struct kref kref; + int instance; u8 eui[8]; u8 uuid[16]; @@ -112,11 +122,17 @@ struct nvme_ns { bool ext; u8 pi_type; int type; + unsigned long flags; + +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 + u64 mode_select_num_blocks; u32 mode_select_block_len; }; struct nvme_ctrl_ops { + struct module *module; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -139,9 +155,9 @@ static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) u32 val = 0; if (ctrl->ops->io_incapable(ctrl)) - return false; + return true; if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return false; + return true; return val & NVME_CSTS_CFS; } @@ -240,6 +256,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); @@ -247,7 +264,8 @@ void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen, u32 *result, unsigned timeout); + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout); int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); @@ -265,8 +283,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); -extern spinlock_t dev_list_lock; - struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 72ef8322d32a..24ccda303efb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -27,7 +27,6 @@ #include <linux/interrupt.h> #include <linux/io.h> #include <linux/kdev_t.h> -#include <linux/kthread.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> @@ -39,6 +38,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/t10-pi.h> +#include <linux/timer.h> #include <linux/types.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> @@ -57,18 +57,6 @@ #define NVME_NR_AEN_COMMANDS 1 #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) -unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); -MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); - -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); -MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); - -unsigned char shutdown_timeout = 5; -module_param(shutdown_timeout, byte, 0644); -MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -76,24 +64,19 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static LIST_HEAD(dev_list); -static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; -static wait_queue_head_t nvme_kthread_wait; struct nvme_dev; struct nvme_queue; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct list_head node; struct nvme_queue **queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; @@ -111,6 +94,8 @@ struct nvme_dev { struct work_struct reset_work; struct work_struct scan_work; struct work_struct remove_work; + struct work_struct async_work; + struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -120,6 +105,7 @@ struct nvme_dev { unsigned long flags; #define NVME_CTRL_RESETTING 0 +#define NVME_CTRL_REMOVING 1 struct nvme_ctrl ctrl; struct completion ioq_wait; @@ -148,7 +134,6 @@ struct nvme_queue { u32 __iomem *q_db; u16 q_depth; s16 cq_vector; - u16 sq_head; u16 sq_tail; u16 cq_head; u16 qid; @@ -286,23 +271,37 @@ static int nvme_init_request(void *data, struct request *req, return 0; } +static void nvme_queue_scan(struct nvme_dev *dev) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + return; + queue_work(nvme_workq, &dev->scan_work); +} + static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { u16 status = le16_to_cpu(cqe->status) >> 1; u32 result = le32_to_cpu(cqe->result); - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { ++dev->ctrl.event_limit; + queue_work(nvme_workq, &dev->async_work); + } + if (status != NVME_SC_SUCCESS) return; switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(dev->dev, "rescanning\n"); - queue_work(nvme_workq, &dev->scan_work); + dev_info(dev->ctrl.device, "rescanning\n"); + nvme_queue_scan(dev); default: - dev_warn(dev->dev, "async event result %08x\n", result); + dev_warn(dev->ctrl.device, "async event result %08x\n", result); } } @@ -678,6 +677,14 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector < 0)) { + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) + ret = BLK_MQ_RQ_QUEUE_BUSY; + else + ret = BLK_MQ_RQ_QUEUE_ERROR; + spin_unlock_irq(&nvmeq->q_lock); + goto out; + } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); @@ -708,7 +715,7 @@ static void nvme_complete_rq(struct request *req) } if (unlikely(iod->aborted)) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "completing aborted command with status: %04x\n", req->errors); } @@ -716,6 +723,13 @@ static void nvme_complete_rq(struct request *req) blk_mq_end_request(req, error); } +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, + u16 phase) +{ + return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; +} + static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) { u16 head, phase; @@ -723,14 +737,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) head = nvmeq->cq_head; phase = nvmeq->cq_phase; - for (;;) { + while (nvme_cqe_valid(nvmeq, head, phase)) { struct nvme_completion cqe = nvmeq->cqes[head]; - u16 status = le16_to_cpu(cqe.status); struct request *req; - if ((status & 1) != phase) - break; - nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; phase = !phase; @@ -740,7 +750,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) *tag = -1; if (unlikely(cqe.command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->q_dmadev, + dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", cqe.command_id, le16_to_cpu(cqe.sq_id)); continue; @@ -759,11 +769,9 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpu(cqe.result); - req->special = (void *)(uintptr_t)result; - } - blk_mq_complete_request(req, status >> 1); + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, &cqe, sizeof(cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1); } @@ -804,18 +812,16 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; - struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; - if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) - return IRQ_NONE; - return IRQ_WAKE_THREAD; + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return IRQ_WAKE_THREAD; + return IRQ_NONE; } static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) { struct nvme_queue *nvmeq = hctx->driver_data; - if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == - nvmeq->cq_phase) { + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { spin_lock_irq(&nvmeq->q_lock); __nvme_process_cq(nvmeq, &tag); spin_unlock_irq(&nvmeq->q_lock); @@ -827,15 +833,22 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static void nvme_submit_async_event(struct nvme_dev *dev) +static void nvme_async_event_work(struct work_struct *work) { + struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); + struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - __nvme_submit_cmd(dev->queues[0], &c); + spin_lock_irq(&nvmeq->q_lock); + while (dev->ctrl.event_limit > 0) { + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + + --dev->ctrl.event_limit; + __nvme_submit_cmd(nvmeq, &c); + } + spin_unlock_irq(&nvmeq->q_lock); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -905,12 +918,10 @@ static void abort_endio(struct request *req, int error) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = iod->nvmeq; - u32 result = (u32)(uintptr_t)req->special; u16 status = req->errors; - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status); atomic_inc(&nvmeq->dev->ctrl.abort_limit); - blk_mq_free_request(req); } @@ -929,7 +940,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_HANDLED. */ if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); @@ -943,7 +954,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); @@ -969,8 +980,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", - req->tag, nvmeq->qid); + dev_warn(nvmeq->dev->ctrl.device, + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, BLK_MQ_REQ_NOWAIT); @@ -999,7 +1011,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved if (!blk_mq_request_started(req)) return; - dev_warn(nvmeq->q_dmadev, + dev_dbg_ratelimited(nvmeq->dev->ctrl.device, "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); status = NVME_SC_ABORT_REQ; @@ -1154,9 +1166,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; - - /* make sure queue descriptor is set before queue count, for kthread */ - mb(); dev->queue_count++; return nvmeq; @@ -1245,6 +1254,12 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. + */ + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1335,53 +1350,31 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_kthread(void *data) -{ - struct nvme_dev *dev, *next; - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - spin_lock(&dev_list_lock); - list_for_each_entry_safe(dev, next, &dev_list, node) { - int i; - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* - * Skip controllers currently under reset. - */ - if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) - continue; - - if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || - csts & NVME_CSTS_CFS) { - if (queue_work(nvme_workq, &dev->reset_work)) { - dev_warn(dev->dev, - "Failed status: %x, reset controller\n", - readl(dev->bar + NVME_REG_CSTS)); - } - continue; - } - for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; - if (!nvmeq) - continue; - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - - while (i == 0 && dev->ctrl.event_limit > 0) - nvme_submit_async_event(dev); - spin_unlock_irq(&nvmeq->q_lock); - } +static void nvme_watchdog_timer(unsigned long data) +{ + struct nvme_dev *dev = (struct nvme_dev *)data; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Skip controllers currently under reset. + */ + if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) && + ((csts & NVME_CSTS_CFS) || + (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) { + if (queue_work(nvme_workq, &dev->reset_work)) { + dev_warn(dev->dev, + "Failed status: 0x%x, reset controller.\n", + csts); } - spin_unlock(&dev_list_lock); - schedule_timeout(round_jiffies_relative(HZ)); + return; } - return 0; + + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); } static int nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i; + unsigned i, max; int ret = 0; for (i = dev->queue_count; i <= dev->max_qid; i++) { @@ -1391,7 +1384,8 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + max = min(dev->max_qid, dev->queue_count - 1); + for (i = dev->online_queues; i <= max; i++) { ret = nvme_create_queue(dev->queues[i], i); if (ret) { nvme_free_queues(dev, i); @@ -1482,7 +1476,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * access to the admin queue, as that might be only way to fix them up. */ if (result > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", result); + dev_err(dev->ctrl.device, + "Could not set queue count (%d)\n", result); nr_io_queues = 0; result = 0; } @@ -1548,9 +1543,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->cq_vector = -1; goto free_queues; } - - /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, nr_io_queues + 1); return nvme_create_io_queues(dev); free_queues: @@ -1684,15 +1676,21 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; dev->ctrl.tagset = &dev->tagset; + } else { + blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); } - queue_work(nvme_workq, &dev->scan_work); + + nvme_queue_scan(dev); return 0; } -static int nvme_dev_map(struct nvme_dev *dev) +static int nvme_pci_enable(struct nvme_dev *dev) { u64 cap; - int bars, result = -ENOMEM; + int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) @@ -1700,24 +1698,14 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; - goto unmap; + goto disable; } /* @@ -1727,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!pdev->irq) { result = pci_enable_msix(pdev, dev->entry, 1); if (result < 0) - goto unmap; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); @@ -1754,18 +1742,20 @@ static int nvme_dev_map(struct nvme_dev *dev) pci_save_state(pdev); return 0; - unmap: - iounmap(dev->bar); - dev->bar = NULL; disable: - pci_release_regions(pdev); - disable_pci: pci_disable_device(pdev); return result; } static void nvme_dev_unmap(struct nvme_dev *dev) { + if (dev->bar) + iounmap(dev->bar); + pci_release_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) +{ struct pci_dev *pdev = to_pci_dev(dev->dev); if (pdev->msi_enabled) @@ -1773,71 +1763,21 @@ static void nvme_dev_unmap(struct nvme_dev *dev) else if (pdev->msix_enabled) pci_disable_msix(pdev); - if (dev->bar) { - iounmap(dev->bar); - dev->bar = NULL; - pci_release_regions(pdev); - } - if (pci_is_enabled(pdev)) { pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); } } -static int nvme_dev_list_add(struct nvme_dev *dev) -{ - bool start_thread = false; - - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) - return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - - return 0; -} - -/* -* Remove the node from the device list and check -* for whether or not we need to stop the nvme_thread. -*/ -static void nvme_dev_list_remove(struct nvme_dev *dev) -{ - struct task_struct *tmp = NULL; - - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { - tmp = nvme_thread; - nvme_thread = NULL; - } - spin_unlock(&dev_list_lock); - - if (tmp) - kthread_stop(tmp); -} - static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; - nvme_dev_list_remove(dev); + del_timer_sync(&dev->watchdog_timer); mutex_lock(&dev->shutdown_lock); - if (dev->bar) { + if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } @@ -1850,7 +1790,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } - nvme_dev_unmap(dev); + nvme_pci_disable(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); @@ -1894,10 +1834,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); + + kref_get(&dev->ctrl.kref); + nvme_dev_disable(dev, false); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); - int result; + int result = -ENODEV; if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) goto out; @@ -1906,44 +1856,43 @@ static void nvme_reset_work(struct work_struct *work) * If we're called to reset a live controller first shut it down before * moving on. */ - if (dev->bar) + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_dev_map(dev); + result = nvme_pci_enable(dev); if (result) goto out; result = nvme_configure_admin_queue(dev); if (result) - goto unmap; + goto out; nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) - goto disable; + goto out; result = nvme_init_identify(&dev->ctrl); if (result) - goto free_tags; + goto out; result = nvme_setup_io_queues(dev); if (result) - goto free_tags; + goto out; dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + queue_work(nvme_workq, &dev->async_work); - result = nvme_dev_list_add(dev); - if (result) - goto remove; + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. */ if (dev->online_queues < 2) { - dev_warn(dev->dev, "IO queues not created\n"); + dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); @@ -1953,19 +1902,8 @@ static void nvme_reset_work(struct work_struct *work) clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; - remove: - nvme_dev_list_remove(dev); - free_tags: - nvme_dev_remove_admin(dev); - blk_put_queue(dev->ctrl.admin_q); - dev->ctrl.admin_q = NULL; - dev->queues[0]->tags = NULL; - disable: - nvme_disable_admin_queue(dev, false); - unmap: - nvme_dev_unmap(dev); out: - nvme_remove_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev, result); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) @@ -1973,19 +1911,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); + nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); nvme_put_ctrl(&dev->ctrl); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - dev_warn(dev->dev, "Removing after probe failure\n"); - kref_get(&dev->ctrl.kref); - if (!schedule_work(&dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static int nvme_reset(struct nvme_dev *dev) { if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) @@ -2029,6 +1960,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .module = THIS_MODULE, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, @@ -2037,6 +1969,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .free_ctrl = nvme_pci_free_ctrl, }; +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + return -ENODEV; + if (pci_request_selected_regions(pdev, bars, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; + release: + pci_release_regions(pdev); + return -ENODEV; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2061,10 +2014,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - INIT_LIST_HEAD(&dev->node); + result = nvme_dev_map(dev); + if (result) + goto free; + INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + INIT_WORK(&dev->async_work, nvme_async_event_work); + setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, + (unsigned long)dev); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2077,6 +2036,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + queue_work(nvme_workq, &dev->reset_work); return 0; @@ -2084,6 +2045,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); put_pci: put_device(dev->dev); + nvme_dev_unmap(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2107,24 +2069,30 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_dev_disable(dev, true); } +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. + */ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + del_timer_sync(&dev->watchdog_timer); + set_bit(NVME_CTRL_REMOVING, &dev->flags); pci_set_drvdata(pdev, NULL); - flush_work(&dev->reset_work); + flush_work(&dev->async_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); + flush_work(&dev->reset_work); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); } @@ -2160,7 +2128,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, * shutdown the controller to quiesce. The controller will be restarted * after the slot reset through driver's slot_reset callback. */ - dev_warn(&pdev->dev, "error detected: state:%d\n", state); + dev_warn(dev->ctrl.device, "error detected: state:%d\n", state); switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; @@ -2177,7 +2145,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - dev_info(&pdev->dev, "restart after slot reset\n"); + dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); queue_work(nvme_workq, &dev->reset_work); return PCI_ERS_RESULT_RECOVERED; @@ -2200,7 +2168,8 @@ static const struct pci_error_handlers nvme_err_handler = { static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), - .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, @@ -2225,34 +2194,20 @@ static int __init nvme_init(void) { int result; - init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; - result = nvme_core_init(); - if (result < 0) - goto kill_workq; - result = pci_register_driver(&nvme_driver); if (result) - goto core_exit; - return 0; - - core_exit: - nvme_core_exit(); - kill_workq: - destroy_workqueue(nvme_workq); + destroy_workqueue(nvme_workq); return result; } static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - nvme_core_exit(); destroy_workqueue(nvme_workq); - BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } |