diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 390 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 7 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 2 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 17 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 315 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 29 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 77 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 86 | ||||
-rw-r--r-- | drivers/nvme/host/scsi.c | 7 |
9 files changed, 776 insertions, 154 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a3c3e32a704..9b3b57fef446 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -26,6 +26,7 @@ #include <linux/ptrace.h> #include <linux/nvme_ioctl.h> #include <linux/t10-pi.h> +#include <linux/pm_qos.h> #include <scsi/sg.h> #include <asm/unaligned.h> @@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries); static int nvme_char_major; module_param(nvme_char_major, int, 0); +static unsigned long default_ps_max_latency_us = 25000; +module_param(default_ps_max_latency_us, ulong, 0644); +MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); @@ -208,18 +214,18 @@ EXPORT_SYMBOL_GPL(nvme_requeue_req); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid) { + unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; if (qid == NVME_QID_ANY) { - req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); + req = blk_mq_alloc_request(q, op, flags); } else { - req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags, + req = blk_mq_alloc_request_hctx(q, op, flags, qid ? qid - 1 : 0); } if (IS_ERR(req)) return req; - req->cmd_type = REQ_TYPE_DRV_PRIV; req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_req(req)->cmd = cmd; @@ -238,26 +244,38 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { + unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; struct nvme_dsm_range *range; - unsigned int nr_bytes = blk_rq_bytes(req); + struct bio *bio; - range = kmalloc(sizeof(*range), GFP_ATOMIC); + range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); if (!range) return BLK_MQ_RQ_QUEUE_BUSY; - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + __rq_for_each_bio(bio, req) { + u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + n++; + } + + if (WARN_ON_ONCE(n != segments)) { + kfree(range); + return BLK_MQ_RQ_QUEUE_ERROR; + } memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.nr = 0; + cmnd->dsm.nr = segments - 1; cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); req->special_vec.bv_page = virt_to_page(range); req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = sizeof(*range); + req->special_vec.bv_len = sizeof(*range) * segments; req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_MQ_RQ_QUEUE_OK; @@ -309,17 +327,27 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, { int ret = BLK_MQ_RQ_QUEUE_OK; - if (req->cmd_type == REQ_TYPE_DRV_PRIV) + switch (req_op(req)) { + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); - else if (req_op(req) == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); - else if (req_op(req) == REQ_OP_DISCARD) + break; + case REQ_OP_DISCARD: ret = nvme_setup_discard(ns, req, cmd); - else + break; + case REQ_OP_READ: + case REQ_OP_WRITE: nvme_setup_rw(ns, req, cmd); + break; + default: + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; + } cmd->common.command_id = req->tag; - return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); @@ -538,7 +566,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL); + c.identify.cns = NVME_ID_CNS_CTRL; *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); if (!*id) @@ -556,7 +584,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n struct nvme_command c = { }; c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST); + c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; c.identify.nsid = cpu_to_le32(nsid); return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } @@ -568,8 +596,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, int error; /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS; *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); if (!*id) @@ -784,6 +813,13 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_sg_io(ns, (void __user *)arg); #endif default: +#ifdef CONFIG_NVM + if (ns->ndev) + return nvme_nvm_ioctl(ns, cmd, arg); +#endif + if (is_sed_ioctl(cmd)) + return sed_ioctl(ns->ctrl->opal_dev, cmd, + (void __user *) arg); return -ENOTTY; } } @@ -861,6 +897,9 @@ static void nvme_config_discard(struct nvme_ns *ns) struct nvme_ctrl *ctrl = ns->ctrl; u32 logical_block_size = queue_logical_block_size(ns->queue); + BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < + NVME_DSM_MAX_RANGES); + if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) ns->queue->limits.discard_zeroes_data = 1; else @@ -869,6 +908,7 @@ static void nvme_config_discard(struct nvme_ns *ns) ns->queue->limits.discard_alignment = logical_block_size; ns->queue->limits.discard_granularity = logical_block_size; blk_queue_max_discard_sectors(ns->queue, UINT_MAX); + blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } @@ -1051,6 +1091,28 @@ static const struct pr_ops nvme_pr_ops = { .pr_clear = nvme_pr_clear, }; +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) +{ + struct nvme_ctrl *ctrl = data; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + if (send) + cmd.common.opcode = nvme_admin_security_send; + else + cmd.common.opcode = nvme_admin_security_recv; + cmd.common.nsid = 0; + cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw10[1] = cpu_to_le32(len); + + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); +#endif /* CONFIG_BLK_SED_OPAL */ + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1196,6 +1258,176 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static void nvme_configure_apst(struct nvme_ctrl *ctrl) +{ + /* + * APST (Autonomous Power State Transition) lets us program a + * table of power state transitions that the controller will + * perform automatically. We configure it with a simple + * heuristic: we are willing to spend at most 2% of the time + * transitioning between power states. Therefore, when running + * in any given state, we will enter the next lower-power + * non-operational state after waiting 100 * (enlat + exlat) + * microseconds, as long as that state's total latency is under + * the requested maximum latency. + * + * We will not autonomously enter any non-operational state for + * which the total latency exceeds ps_max_latency_us. Users + * can set ps_max_latency_us to zero to turn off APST. + */ + + unsigned apste; + struct nvme_feat_auto_pst *table; + int ret; + + /* + * If APST isn't supported or if we haven't been initialized yet, + * then don't do anything. + */ + if (!ctrl->apsta) + return; + + if (ctrl->npss > 31) { + dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); + return; + } + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return; + + if (ctrl->ps_max_latency_us == 0) { + /* Turn off APST. */ + apste = 0; + } else { + __le64 target = cpu_to_le64(0); + int state; + + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more + * power. NPSS, despite the name, is the index of the + * lowest-power state, not the number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, transition_ms; + + if (target) + table->entries[state] = target; + + /* + * Is this state a useful non-operational state for + * higher-power states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & + NVME_PS_FLAGS_NON_OP_STATE)) + continue; + + total_latency_us = + (u64)le32_to_cpu(ctrl->psd[state].entry_lat) + + + le32_to_cpu(ctrl->psd[state].exit_lat); + if (total_latency_us > ctrl->ps_max_latency_us) + continue; + + /* + * This state is good. Use it as the APST idle + * target for higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | + (transition_ms << 8)); + } + + apste = 1; + } + + ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, + table, sizeof(*table), NULL); + if (ret) + dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); + + kfree(table); +} + +static void nvme_set_latency_tolerance(struct device *dev, s32 val) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u64 latency; + + switch (val) { + case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: + case PM_QOS_LATENCY_ANY: + latency = U64_MAX; + break; + + default: + latency = val; + } + + if (ctrl->ps_max_latency_us != latency) { + ctrl->ps_max_latency_us = latency; + nvme_configure_apst(ctrl); + } +} + +struct nvme_core_quirk_entry { + /* + * NVMe model and firmware strings are padded with spaces. For + * simplicity, strings in the quirk table are padded with NULLs + * instead. + */ + u16 vid; + const char *mn; + const char *fr; + unsigned long quirks; +}; + +static const struct nvme_core_quirk_entry core_quirks[] = { + /* + * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes + * the controller to go out to lunch. It dies when the watchdog + * timer reads CSTS and gets 0xffffffff. + */ + { + .vid = 0x144d, + .fr = "BXW75D0Q", + .quirks = NVME_QUIRK_NO_APST, + }, +}; + +/* match is null-terminated but idstr is space-padded. */ +static bool string_matches(const char *idstr, const char *match, size_t len) +{ + size_t matchlen; + + if (!match) + return true; + + matchlen = strlen(match); + WARN_ON_ONCE(matchlen > len); + + if (memcmp(idstr, match, matchlen)) + return false; + + for (; matchlen < len; matchlen++) + if (idstr[matchlen] != ' ') + return false; + + return true; +} + +static bool quirk_matches(const struct nvme_id_ctrl *id, + const struct nvme_core_quirk_entry *q) +{ + return q->vid == le16_to_cpu(id->vid) && + string_matches(id->mn, q->mn, sizeof(id->mn)) && + string_matches(id->fr, q->fr, sizeof(id->fr)); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1207,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; + u8 prev_apsta; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1230,6 +1463,25 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (!ctrl->identified) { + /* + * Check for quirks. Quirk can depend on firmware version, + * so, in principle, the set of quirks present can change + * across a reset. As a possible future enhancement, we + * could re-scan for quirks every time we reinitialize + * the device, but we'd have to make sure that the driver + * behaves intelligently if the quirks change. + */ + + int i; + + for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { + if (quirk_matches(id, &core_quirks[i])) + ctrl->quirks |= core_quirks[i].quirks; + } + } + + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); @@ -1249,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->npss = id->npss; + prev_apsta = ctrl->apsta; + ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta; + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); + if (ctrl->ops->is_fabrics) { ctrl->icdoff = le16_to_cpu(id->icdoff); ctrl->ioccsz = le32_to_cpu(id->ioccsz); @@ -1272,6 +1529,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } kfree(id); + + if (ctrl->apsta && !prev_apsta) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apsta && prev_apsta) + dev_pm_qos_hide_latency_tolerance(ctrl->device); + + nvme_configure_apst(ctrl); + + ctrl->identified = true; + return ret; } EXPORT_SYMBOL_GPL(nvme_init_identify); @@ -1521,6 +1788,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev, } static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); +static ssize_t nvme_sysfs_show_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + static const char *const state_name[] = { + [NVME_CTRL_NEW] = "new", + [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_RESETTING] = "resetting", + [NVME_CTRL_RECONNECTING]= "reconnecting", + [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DEAD] = "dead", + }; + + if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && + state_name[ctrl->state]) + return sprintf(buf, "%s\n", state_name[ctrl->state]); + + return sprintf(buf, "unknown state\n"); +} + +static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); + static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, struct device_attribute *attr, char *buf) @@ -1553,6 +1843,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_transport.attr, &dev_attr_subsysnqn.attr, &dev_attr_address.attr, + &dev_attr_state.attr, NULL }; @@ -2009,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, list_add_tail(&ctrl->node, &nvme_ctrl_list); spin_unlock(&dev_list_lock); + /* + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. + */ + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + return 0; out_release_instance: nvme_release_instance(ctrl); @@ -2034,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - revalidate_disk(ns->disk); - + if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + continue; + revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); @@ -2045,6 +2344,53 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_kill_queues); +void nvme_unfreeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unfreeze_queue(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_unfreeze); + +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); + if (timeout <= 0) + break; + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); + +void nvme_wait_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_wait(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze); + +void nvme_start_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_start(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_start_freeze); + void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 916d13608059..5b7386f69f4d 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -480,11 +480,16 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); * being implemented to the common NVMe fabrics library. Part of * the overall init sequence of starting up a fabrics driver. */ -void nvmf_register_transport(struct nvmf_transport_ops *ops) +int nvmf_register_transport(struct nvmf_transport_ops *ops) { + if (!ops->create_ctrl) + return -EINVAL; + mutex_lock(&nvmf_transports_mutex); list_add_tail(&ops->entry, &nvmf_transports); mutex_unlock(&nvmf_transports_mutex); + + return 0; } EXPORT_SYMBOL_GPL(nvmf_register_transport); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 924145c979f1..156018182ce4 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -128,7 +128,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); -void nvmf_register_transport(struct nvmf_transport_ops *ops); +int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e65041c640cb..9690beb15e69 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1937,7 +1937,7 @@ nvme_fc_complete_rq(struct request *rq) return; } - if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(rq)) error = rq->errors; else error = nvme_error_status(rq->errors); @@ -2353,18 +2353,6 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, /* sanity checks */ - /* FC-NVME supports 64-byte SQE only */ - if (ctrl->ctrl.ioccsz != 4) { - dev_err(ctrl->ctrl.device, "ioccsz %d is not supported!\n", - ctrl->ctrl.ioccsz); - goto out_remove_admin_queue; - } - /* FC-NVME supports 16-byte CQE only */ - if (ctrl->ctrl.iorcsz != 1) { - dev_err(ctrl->ctrl.device, "iorcsz %d is not supported!\n", - ctrl->ctrl.iorcsz); - goto out_remove_admin_queue; - } /* FC-NVME does not have other data in the capsule */ if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", @@ -2562,8 +2550,7 @@ static int __init nvme_fc_init_module(void) if (!nvme_fc_wq) return -ENOMEM; - nvmf_register_transport(&nvme_fc_transport); - return 0; + return nvmf_register_transport(&nvme_fc_transport); } static void __exit nvme_fc_exit_module(void) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 588d4a34c083..21cac8523bd8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -26,6 +26,8 @@ #include <linux/bitops.h> #include <linux/lightnvm.h> #include <linux/vmalloc.h> +#include <linux/sched/sysctl.h> +#include <uapi/linux/lightnvm.h> enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, @@ -248,50 +250,48 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; struct nvm_id_group *dst; - int i, end; - - end = min_t(u32, 4, nvm_id->cgrps); - - for (i = 0; i < end; i++) { - src = &nvme_nvm_id->groups[i]; - dst = &nvm_id->groups[i]; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); - - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); + if (nvme_nvm_id->cgrps != 1) + return -EINVAL; + + src = &nvme_nvm_id->groups[0]; + dst = &nvm_id->grp; + + dst->mtype = src->mtype; + dst->fmtype = src->fmtype; + dst->num_ch = src->num_ch; + dst->num_lun = src->num_lun; + dst->num_pln = src->num_pln; + + dst->num_pg = le16_to_cpu(src->num_pg); + dst->num_blk = le16_to_cpu(src->num_blk); + dst->fpg_sz = le16_to_cpu(src->fpg_sz); + dst->csecs = le16_to_cpu(src->csecs); + dst->sos = le16_to_cpu(src->sos); + + dst->trdt = le32_to_cpu(src->trdt); + dst->trdm = le32_to_cpu(src->trdm); + dst->tprt = le32_to_cpu(src->tprt); + dst->tprm = le32_to_cpu(src->tprm); + dst->tbet = le32_to_cpu(src->tbet); + dst->tbem = le32_to_cpu(src->tbem); + dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); + + dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; } + + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs); } return 0; @@ -321,7 +321,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) nvm_id->ver_id = nvme_nvm_id->ver_id; nvm_id->vmnt = nvme_nvm_id->vmnt; - nvm_id->cgrps = nvme_nvm_id->cgrps; nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap); nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom); memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf, @@ -372,7 +371,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, } /* Transform physical address to target address space */ - nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb); + nvm_part_to_tgt(nvmdev, entries, cmd_nlb); if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { ret = -EINTR; @@ -485,7 +484,8 @@ static void nvme_nvm_end_io(struct request *rq, int error) struct nvm_rq *rqd = rq->end_io_data; rqd->ppa_status = nvme_req(rq)->result.u64; - nvm_end_io(rqd, error); + rqd->error = error; + nvm_end_io(rqd); kfree(nvme_req(rq)->cmd); blk_mq_free_request(rq); @@ -586,6 +586,224 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; +static void nvme_nvm_end_user_vio(struct request *rq, int error) +{ + struct completion *waiting = rq->end_io_data; + + complete(waiting); +} + +static int nvme_nvm_submit_user_cmd(struct request_queue *q, + struct nvme_ns *ns, + struct nvme_nvm_command *vcmd, + void __user *ubuf, unsigned int bufflen, + void __user *meta_buf, unsigned int meta_len, + void __user *ppa_buf, unsigned int ppa_len, + u32 *result, u64 *status, unsigned int timeout) +{ + bool write = nvme_is_write((struct nvme_command *)vcmd); + struct nvm_dev *dev = ns->ndev; + struct gendisk *disk = ns->disk; + struct request *rq; + struct bio *bio = NULL; + __le64 *ppa_list = NULL; + dma_addr_t ppa_dma; + __le64 *metadata = NULL; + dma_addr_t metadata_dma; + DECLARE_COMPLETION_ONSTACK(wait); + int ret; + + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, + NVME_QID_ANY); + if (IS_ERR(rq)) { + ret = -ENOMEM; + goto err_cmd; + } + + rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; + rq->end_io_data = &wait; + + if (ppa_buf && ppa_len) { + ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); + if (!ppa_list) { + ret = -ENOMEM; + goto err_rq; + } + if (copy_from_user(ppa_list, (void __user *)ppa_buf, + sizeof(u64) * (ppa_len + 1))) { + ret = -EFAULT; + goto err_ppa; + } + vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); + } else { + vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); + } + + if (ubuf && bufflen) { + ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); + if (ret) + goto err_ppa; + bio = rq->bio; + + if (meta_buf && meta_len) { + metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, + &metadata_dma); + if (!metadata) { + ret = -ENOMEM; + goto err_map; + } + + if (write) { + if (copy_from_user(metadata, + (void __user *)meta_buf, + meta_len)) { + ret = -EFAULT; + goto err_meta; + } + } + vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); + } + + if (!disk) + goto submit; + + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto err_meta; + } + } + +submit: + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); + + wait_for_completion_io(&wait); + + ret = nvme_error_status(rq->errors); + if (result) + *result = rq->errors & 0x7ff; + if (status) + *status = le64_to_cpu(nvme_req(rq)->result.u64); + + if (metadata && !ret && !write) { + if (copy_to_user(meta_buf, (void *)metadata, meta_len)) + ret = -EFAULT; + } +err_meta: + if (meta_buf && meta_len) + dma_pool_free(dev->dma_pool, metadata, metadata_dma); +err_map: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } +err_ppa: + if (ppa_buf && ppa_len) + dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); +err_rq: + blk_mq_free_request(rq); +err_cmd: + return ret; +} + +static int nvme_nvm_submit_vio(struct nvme_ns *ns, + struct nvm_user_vio __user *uvio) +{ + struct nvm_user_vio vio; + struct nvme_nvm_command c; + unsigned int length; + int ret; + + if (copy_from_user(&vio, uvio, sizeof(vio))) + return -EFAULT; + if (vio.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.ph_rw.opcode = vio.opcode; + c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.control = cpu_to_le16(vio.control); + c.ph_rw.length = cpu_to_le16(vio.nppas); + + length = (vio.nppas + 1) << ns->lba_shift; + + ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, + (void __user *)(uintptr_t)vio.addr, length, + (void __user *)(uintptr_t)vio.metadata, + vio.metadata_len, + (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, + &vio.result, &vio.status, 0); + + if (ret && copy_to_user(uvio, &vio, sizeof(vio))) + return -EFAULT; + + return ret; +} + +static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, + struct nvm_passthru_vio __user *uvcmd) +{ + struct nvm_passthru_vio vcmd; + struct nvme_nvm_command c; + struct request_queue *q; + unsigned int timeout = 0; + int ret; + + if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) + return -EFAULT; + if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) + return -EACCES; + if (vcmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = vcmd.opcode; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); + /* cdw11-12 */ + c.ph_rw.length = cpu_to_le16(vcmd.nppas); + c.ph_rw.control = cpu_to_le32(vcmd.control); + c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + + if (vcmd.timeout_ms) + timeout = msecs_to_jiffies(vcmd.timeout_ms); + + q = admin ? ns->ctrl->admin_q : ns->queue; + + ret = nvme_nvm_submit_user_cmd(q, ns, + (struct nvme_nvm_command *)&c, + (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, + (void __user *)(uintptr_t)vcmd.metadata, + vcmd.metadata_len, + (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, + &vcmd.result, &vcmd.status, timeout); + + if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) + return -EFAULT; + + return ret; +} + +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_NVM_IOCTL_ADMIN_VIO: + return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + case NVME_NVM_IOCTL_IO_VIO: + return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + case NVME_NVM_IOCTL_SUBMIT_VIO: + return nvme_nvm_submit_vio(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; @@ -622,7 +840,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return 0; id = &ndev->identity; - grp = &id->groups[0]; + grp = &id->grp; attr = &dattr->attr; if (strcmp(attr->name, "version") == 0) { @@ -633,10 +851,9 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); } else if (strcmp(attr->name, "device_mode") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + /* kept for compatibility */ } else if (strcmp(attr->name, "media_manager") == 0) { - if (!ndev->mt) - return scnprintf(page, PAGE_SIZE, "%s\n", "none"); - return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); } else if (strcmp(attr->name, "ppa_format") == 0) { return scnprintf(page, PAGE_SIZE, "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index aead6d08ed2c..2aa20e3e5675 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,6 +19,7 @@ #include <linux/kref.h> #include <linux/blk-mq.h> #include <linux/lightnvm.h> +#include <linux/sed-opal.h> enum { /* @@ -77,6 +78,11 @@ enum nvme_quirks { * readiness, which is done by reading the NVME_CSTS_RDY bit. */ NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), + + /* + * APST should not be used. + */ + NVME_QUIRK_NO_APST = (1 << 4), }; /* @@ -111,6 +117,7 @@ enum nvme_ctrl_state { struct nvme_ctrl { enum nvme_ctrl_state state; + bool identified; spinlock_t lock; const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; @@ -125,6 +132,8 @@ struct nvme_ctrl { struct list_head node; struct ida ns_ida; + struct opal_dev *opal_dev; + char name[12]; char serial[20]; char model[40]; @@ -137,19 +146,26 @@ struct nvme_ctrl { u32 max_hw_sectors; u16 oncs; u16 vid; + u16 oacs; atomic_t abort_limit; u8 event_limit; u8 vwc; u32 vs; u32 sgls; u16 kas; + u8 npss; + u8 apsta; unsigned int kato; bool subsystem; unsigned long quirks; + struct nvme_id_power_state psd[32]; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + /* Power saving configuration */ + u64 ps_max_latency_us; + /* Fabrics only */ u16 sqsize; u32 ioccsz; @@ -267,6 +283,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); + #define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); @@ -275,6 +294,10 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_unfreeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, @@ -318,6 +341,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); int nvme_nvm_register_sysfs(struct nvme_ns *ns); void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -335,6 +359,11 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} #endif /* CONFIG_NVM */ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3faefabf339c..26a5fd05fe88 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -43,6 +43,7 @@ #include <linux/types.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> +#include <linux/sed-opal.h> #include "nvme.h" @@ -588,7 +589,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, */ if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && - req->cmd_type != REQ_TYPE_DRV_PRIV) { + !blk_rq_is_passthrough(req)) { blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } @@ -612,10 +613,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) - ret = BLK_MQ_RQ_QUEUE_BUSY; - else - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); goto out_cleanup_iod; } @@ -645,7 +643,7 @@ static void nvme_complete_rq(struct request *req) return; } - if (req->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(req)) error = req->errors; else error = nvme_error_status(req->errors); @@ -895,12 +893,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - iod->aborted = 1; - if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } + iod->aborted = 1; memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; @@ -1041,9 +1038,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth) + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, + node); if (!nvmeq) return NULL; @@ -1178,6 +1176,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1219,7 +1218,8 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); if (!nvmeq) return -ENOMEM; } @@ -1311,7 +1311,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev) int ret = 0; for (i = dev->queue_count; i <= dev->max_qid; i++) { - if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + /* vector == qid - 1, match nvme_create_queue */ + if (!nvme_alloc_queue(dev, i, dev->q_depth, + pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; } @@ -1673,21 +1675,34 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i, queues; - u32 csts = -1; + bool dead = true; + struct pci_dev *pdev = to_pci_dev(dev->dev); del_timer_sync(&dev->watchdog_timer); mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(to_pci_dev(dev->dev))) { - nvme_stop_queues(&dev->ctrl); - csts = readl(dev->bar + NVME_REG_CSTS); + if (pci_is_enabled(pdev)) { + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + if (dev->ctrl.state == NVME_CTRL_LIVE) + nvme_start_freeze(&dev->ctrl); + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || + pdev->error_state != pci_channel_io_normal); } + /* + * Give the controller a chance to complete all entered requests if + * doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + nvme_stop_queues(&dev->ctrl); + queues = dev->online_queues - 1; for (i = dev->queue_count - 1; i > 0; i--) nvme_suspend_queue(dev->queues[i]); - if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { + if (dead) { /* A device might become IO incapable very soon during * probe, before the admin queue is configured. Thus, * queue_count can be 0 here. @@ -1702,6 +1717,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + + /* + * The driver will not be starting up queues again if shutting down so + * must flush all entered requests to their failed completion to avoid + * deadlocking blk-mq hot-cpu notifier. + */ + if (shutdown) + nvme_start_queues(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); } @@ -1738,6 +1761,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); + free_opal_dev(dev->ctrl.opal_dev); kfree(dev); } @@ -1754,6 +1778,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) @@ -1786,6 +1811,17 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!dev->ctrl.opal_dev) + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + else if (was_suspend) + opal_unlock_from_suspend(dev->ctrl.opal_dev); + } else { + free_opal_dev(dev->ctrl.opal_dev); + dev->ctrl.opal_dev = NULL; + } + result = nvme_setup_io_queues(dev); if (result) goto out; @@ -1811,7 +1847,9 @@ static void nvme_reset_work(struct work_struct *work) nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); nvme_dev_add(dev); + nvme_unfreeze(&dev->ctrl); } if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { @@ -1990,8 +2028,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - if (!pci_device_is_present(pdev)) + if (!pci_device_is_present(pdev)) { nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + nvme_dev_disable(dev, false); + } flush_work(&dev->reset_work); nvme_uninit_ctrl(&dev->ctrl); @@ -2110,6 +2150,7 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 557f29b1f1bb..47a479f26e5d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -42,28 +42,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -static const char *const nvme_rdma_cm_status_strs[] = { - [NVME_RDMA_CM_INVALID_LEN] = "invalid length", - [NVME_RDMA_CM_INVALID_RECFMT] = "invalid record format", - [NVME_RDMA_CM_INVALID_QID] = "invalid queue ID", - [NVME_RDMA_CM_INVALID_HSQSIZE] = "invalid host SQ size", - [NVME_RDMA_CM_INVALID_HRQSIZE] = "invalid host RQ size", - [NVME_RDMA_CM_NO_RSC] = "resource not found", - [NVME_RDMA_CM_INVALID_IRD] = "invalid IRD", - [NVME_RDMA_CM_INVALID_ORD] = "Invalid ORD", -}; - -static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) -{ - size_t index = status; - - if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) && - nvme_rdma_cm_status_strs[index]) - return nvme_rdma_cm_status_strs[index]; - else - return "unrecognized reason"; -}; - /* * We handle AEN commands ourselves and don't even let the * block layer know about them. @@ -155,6 +133,10 @@ struct nvme_rdma_ctrl { struct sockaddr addr; struct sockaddr_in addr_in; }; + union { + struct sockaddr src_addr; + struct sockaddr_in src_addr_in; + }; struct nvme_ctrl ctrl; }; @@ -361,8 +343,6 @@ static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, struct ib_device *ibdev = dev->dev; int ret; - BUG_ON(queue_idx >= ctrl->queue_count); - ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (ret) @@ -567,6 +547,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, int idx, size_t queue_size) { struct nvme_rdma_queue *queue; + struct sockaddr *src_addr = NULL; int ret; queue = &ctrl->queues[idx]; @@ -589,7 +570,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, } queue->cm_error = -ETIMEDOUT; - ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr, + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + src_addr = &ctrl->src_addr; + + ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { dev_info(ctrl->ctrl.device, @@ -666,8 +650,22 @@ out_free_queues: static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) { + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int i, ret; + nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", nr_io_queues); + for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.opts->queue_size); @@ -1065,7 +1063,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * sequencer is not allocated in our driver's tagset and it's * triggered to be freed by blk_cleanup_queue(). So we need to * always mark it as signaled to ensure that the "wr_cqe", which is - * embeded in request's payload, is not freed when __ib_process_cq() + * embedded in request's payload, is not freed when __ib_process_cq() * calls wr_cqe->done(). */ if ((++queue->sig_count % 32) == 0 || flush) @@ -1265,7 +1263,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) dev = nvme_rdma_find_get_device(queue->cm_id); if (!dev) { - dev_err(queue->cm_id->device->dma_device, + dev_err(queue->cm_id->device->dev.parent, "no client data found!\n"); return -ECONNREFUSED; } @@ -1423,7 +1421,7 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { struct nvme_command *cmd = nvme_req(rq)->cmd; - if (rq->cmd_type != REQ_TYPE_DRV_PRIV || + if (!blk_rq_is_passthrough(rq) || cmd->common.opcode != nvme_fabrics_command || cmd->fabrics.fctype != nvme_fabrics_type_connect) return false; @@ -1471,7 +1469,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH) + if (req_op(rq) == REQ_OP_FLUSH) flush = true; ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); @@ -1522,7 +1520,7 @@ static void nvme_rdma_complete_rq(struct request *rq) return; } - if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(rq)) error = rq->errors; else error = nvme_error_status(rq->errors); @@ -1805,20 +1803,8 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) { - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); - if (ret) - return ret; - - ctrl->queue_count = opts->nr_io_queues + 1; - if (ctrl->queue_count < 2) - return 0; - - dev_info(ctrl->ctrl.device, - "creating %d I/O queues.\n", opts->nr_io_queues); - ret = nvme_rdma_init_io_queues(ctrl); if (ret) return ret; @@ -1905,6 +1891,16 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_free_ctrl; } + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in, + opts->host_traddr); + if (ret) { + pr_err("malformed src IP address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + if (opts->mask & NVMF_OPT_TRSVCID) { u16 port; @@ -2016,7 +2012,8 @@ out_free_ctrl: static struct nvmf_transport_ops nvme_rdma_transport = { .name = "rdma", .required_opts = NVMF_OPT_TRADDR, - .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR, .create_ctrl = nvme_rdma_create_ctrl, }; @@ -2063,8 +2060,7 @@ static int __init nvme_rdma_init_module(void) return ret; } - nvmf_register_transport(&nvme_rdma_transport); - return 0; + return nvmf_register_transport(&nvme_rdma_transport); } static void __exit nvme_rdma_cleanup_module(void) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index a5c09e703bd8..f49ae2758bb7 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -43,6 +43,7 @@ #include <asm/unaligned.h> #include <scsi/sg.h> #include <scsi/scsi.h> +#include <scsi/scsi_request.h> #include "nvme.h" @@ -2347,12 +2348,14 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) { - u8 cmd[BLK_MAX_CDB]; + u8 cmd[16]; int retcode; unsigned int opcode; if (hdr->cmdp == NULL) return -EMSGSIZE; + if (hdr->cmd_len > sizeof(cmd)) + return -EINVAL; if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; @@ -2451,8 +2454,6 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return -EFAULT; if (hdr.interface_id != 'S') return -EINVAL; - if (hdr.cmd_len > BLK_MAX_CDB) - return -EINVAL; /* * A positive return code means a NVMe status, which has been |