diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 108 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 1 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 27 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 349 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 78 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 77 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 234 | ||||
-rw-r--r-- | drivers/nvme/host/trace.c | 11 | ||||
-rw-r--r-- | drivers/nvme/host/trace.h | 142 |
10 files changed, 670 insertions, 359 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bf65501e6ed6..dd8ec1dd9219 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req); if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if (nvme_req_needs_failover(req, status)) { + if ((req->cmd_flags & REQ_NVME_MPATH) && + blk_path_error(status)) { nvme_failover_req(req); return; } @@ -617,6 +618,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; + } else if (req_op(req) == REQ_OP_WRITE) { + t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -627,8 +630,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } } @@ -638,6 +640,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, return 0; } +void nvme_cleanup_cmd(struct request *req) +{ + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + nvme_req(req)->status == 0) { + struct nvme_ns *ns = req->rq_disk->private_data; + + t10_pi_complete(req, ns->pi_type, + blk_rq_bytes(req) >> ns->lba_shift); + } + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + kfree(page_address(req->special_vec.bv_page) + + req->special_vec.bv_offset); + } +} +EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); + blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd) { @@ -668,10 +686,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; - if (ns) - trace_nvme_setup_nvm_cmd(req->q->id, cmd); - else - trace_nvme_setup_admin_cmd(cmd); + trace_nvme_setup_cmd(req, cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); @@ -864,9 +879,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); - memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); - ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } @@ -1056,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -1472,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); nvme_config_discard(ns); + + if (id->nsattr & (1 << 0)) + set_disk_ro(disk, true); + else + set_disk_ro(disk, false); + blk_mq_unfreeze_queue(disk->queue); } @@ -2270,21 +2288,16 @@ out_unlock: return ret; } -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, - size_t size, u64 offset) +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset) { struct nvme_command c = { }; unsigned long dwlen = size / 4 - 1; c.get_log_page.opcode = nvme_admin_get_log_page; - - if (ns) - c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); - else - c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); - + c.get_log_page.nsid = cpu_to_le32(nsid); c.get_log_page.lid = log_page; + c.get_log_page.lsp = lsp; c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); @@ -2293,12 +2306,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, - size_t size) -{ - return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); -} - static int nvme_get_effects_log(struct nvme_ctrl *ctrl) { int ret; @@ -2309,8 +2316,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) if (!ctrl->effects) return 0; - ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, - sizeof(*ctrl->effects)); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, + ctrl->effects, sizeof(*ctrl->effects), 0); if (ret) { kfree(ctrl->effects); ctrl->effects = NULL; @@ -2401,6 +2408,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) nvme_set_queue_limits(ctrl, ctrl->admin_q); ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->max_namespaces = le32_to_cpu(id->mnan); if (id->rtd3e) { /* us -> s */ @@ -2460,8 +2468,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } + ret = nvme_mpath_init(ctrl, id); kfree(id); + if (ret < 0) + return ret; + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) @@ -2680,6 +2692,10 @@ static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_ana_grpid.attr, + &dev_attr_ana_state.attr, +#endif NULL, }; @@ -2702,6 +2718,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { + if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + return 0; + if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) + return 0; + } +#endif return a->mode; } @@ -3075,8 +3099,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_get_ctrl(ctrl); - kfree(id); - device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group)) @@ -3086,8 +3108,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); - nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(ns); + kfree(id); + return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -3229,7 +3253,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, + log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -3346,9 +3371,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) - dev_warn(ctrl->device, - "Get FW SLOT INFO log error\n"); + if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, + sizeof(*log), 0)) + dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -3394,6 +3419,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) case NVME_AER_NOTICE_FW_ACT_STARTING: queue_work(nvme_wq, &ctrl->fw_act_work); break; +#ifdef CONFIG_NVME_MULTIPATH + case NVME_AER_NOTICE_ANA: + if (!ctrl->ana_log_buf) + break; + queue_work(nvme_wq, &ctrl->ana_work); + break; +#endif default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3426,6 +3458,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); @@ -3463,6 +3496,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); + nvme_mpath_uninit(ctrl); if (subsys) { mutex_lock(&subsys->lock); @@ -3499,6 +3533,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f7efe5a58cc7..206d63cb1afc 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) { - if (ctrl->opts->max_reconnects != -1 && + if (ctrl->opts->max_reconnects == -1 || ctrl->nr_reconnects < ctrl->opts->max_reconnects) return true; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9bac912173ba..611e70cae754 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; + nvme_req(rq)->ctrl = &ctrl->ctrl; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 41279da799ed..6fe5923c95d4 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, /* Set compacted version for upper layers */ geo->version = NVM_OCSSD_SPEC_20; - if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) { - pr_err("nvm: OCSSD version not supported (v%d.%d)\n", - geo->major_ver_id, geo->minor_ver_id); - return -EINVAL; - } - geo->num_ch = le16_to_cpu(id->num_grp); geo->num_lun = le16_to_cpu(id->num_pu); geo->all_luns = geo->num_ch * geo->num_lun; @@ -583,7 +577,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, struct ppa_addr ppa; size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); size_t log_pos, offset, len; - int ret, i; + int ret, i, max_len; + + /* + * limit requests to maximum 256K to avoid issuing arbitrary large + * requests when the device does not specific a maximum transfer size. + */ + max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); /* Normalize lba address space to obtain log offset */ ppa.ppa = slba; @@ -596,10 +596,11 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, offset = log_pos * sizeof(struct nvme_nvm_chk_meta); while (left) { - len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9); + len = min_t(unsigned int, left, max_len); - ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, - dev_meta, len, offset); + ret = nvme_get_log(ctrl, ns->head->ns_id, + NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, + offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; @@ -662,12 +663,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (rqd->bio) { + if (rqd->bio) blk_init_request_from_bio(rq, rqd->bio); - } else { + else rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - rq->__data_len = 0; - } return rq; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1ffd3e8b13a1..5a9562881d4e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Christoph Hellwig. + * Copyright (c) 2017-2018 Christoph Hellwig. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,6 +20,11 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3)); +} + /* * If multipathing is enabled we need to always use the subsystem instance * number for numbering our devices to avoid conflicts between subsystems that @@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status; unsigned long flags; spin_lock_irqsave(&ns->head->requeue_lock, flags); @@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req) spin_unlock_irqrestore(&ns->head->requeue_lock, flags); blk_mq_end_request(req, 0); - nvme_reset_ctrl(ns->ctrl); - kblockd_schedule_work(&ns->head->requeue_work); -} + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + /* + * If we got back an ANA error we know the controller is alive, + * but not ready to serve this namespaces. The spec suggests + * we should update our general state here, but due to the fact + * that the admin and I/O queues are not serialized that is + * fundamentally racy. So instead just clear the current path, + * mark the the path as pending and kick of a re-read of the ANA + * log page ASAP. + */ + nvme_mpath_clear_current_path(ns); + if (ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + break; + default: + /* + * Reset the controller for any non-ANA error as we don't know + * what caused the error. + */ + nvme_reset_ctrl(ns->ctrl); + break; + } -bool nvme_req_needs_failover(struct request *req, blk_status_t error) -{ - if (!(req->cmd_flags & REQ_NVME_MPATH)) - return false; - return blk_path_error(error); + kblockd_schedule_work(&ns->head->requeue_work); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) @@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) up_read(&ctrl->namespaces_rwsem); } +static const char *nvme_ana_state_names[] = { + [0] = "invalid state", + [NVME_ANA_OPTIMIZED] = "optimized", + [NVME_ANA_NONOPTIMIZED] = "non-optimized", + [NVME_ANA_INACCESSIBLE] = "inaccessible", + [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", + [NVME_ANA_CHANGE] = "change", +}; + static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) { - struct nvme_ns *ns; + struct nvme_ns *ns, *fallback = NULL; list_for_each_entry_rcu(ns, &head->list, siblings) { - if (ns->ctrl->state == NVME_CTRL_LIVE) { + if (ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + continue; + switch (ns->ana_state) { + case NVME_ANA_OPTIMIZED: rcu_assign_pointer(head->current_path, ns); return ns; + case NVME_ANA_NONOPTIMIZED: + fallback = ns; + break; + default: + break; } } - return NULL; + if (fallback) + rcu_assign_pointer(head->current_path, fallback); + return fallback; +} + +static inline bool nvme_path_is_optimized(struct nvme_ns *ns) +{ + return ns->ctrl->state == NVME_CTRL_LIVE && + ns->ana_state == NVME_ANA_OPTIMIZED; } inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) { struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); - if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + if (unlikely(!ns || !nvme_path_is_optimized(ns))) ns = __nvme_find_path(head); return ns; } @@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path, &head->srcu); - if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + if (likely(ns && nvme_path_is_optimized(ns))) found = ns->queue->poll_fn(q, qc); srcu_read_unlock(&head->srcu, srcu_idx); return found; @@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) struct request_queue *q; bool vwc = false; + mutex_init(&head->lock); bio_list_init(&head->requeue_list); spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); @@ -220,29 +273,232 @@ out: return -ENOMEM; } -void nvme_mpath_add_disk(struct nvme_ns_head *head) +static void nvme_mpath_set_live(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + + lockdep_assert_held(&ns->head->lock); + if (!head->disk) return; - mutex_lock(&head->subsys->lock); if (!(head->disk->flags & GENHD_FL_UP)) { device_add_disk(&head->subsys->dev, head->disk); if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, &nvme_ns_id_attr_group)) - pr_warn("%s: failed to create sysfs group for identification\n", - head->disk->disk_name); + dev_warn(&head->subsys->dev, + "failed to create id group.\n"); + } + + kblockd_schedule_work(&ns->head->requeue_work); +} + +static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, + int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, + void *)) +{ + void *base = ctrl->ana_log_buf; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + int error, i; + + lockdep_assert_held(&ctrl->ana_lock); + + for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { + struct nvme_ana_group_desc *desc = base + offset; + u32 nr_nsids = le32_to_cpu(desc->nnsids); + size_t nsid_buf_size = nr_nsids * sizeof(__le32); + + if (WARN_ON_ONCE(desc->grpid == 0)) + return -EINVAL; + if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state == 0)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) + return -EINVAL; + + offset += sizeof(*desc); + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) + return -EINVAL; + + error = cb(ctrl, desc, data); + if (error) + return error; + + offset += nsid_buf_size; + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + } + + return 0; +} + +static inline bool nvme_state_is_live(enum nvme_ana_state state) +{ + return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; +} + +static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, + struct nvme_ns *ns) +{ + enum nvme_ana_state old; + + mutex_lock(&ns->head->lock); + old = ns->ana_state; + ns->ana_grpid = le32_to_cpu(desc->grpid); + ns->ana_state = desc->state; + clear_bit(NVME_NS_ANA_PENDING, &ns->flags); + + if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old)) + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); +} + +static int nvme_update_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; + unsigned *nr_change_groups = data; + struct nvme_ns *ns; + + dev_info(ctrl->device, "ANA group %d: %s.\n", + le32_to_cpu(desc->grpid), + nvme_ana_state_names[desc->state]); + + if (desc->state == NVME_ANA_CHANGE) + (*nr_change_groups)++; + + if (!nr_nsids) + return 0; + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + continue; + nvme_update_ns_ana_state(desc, ns); + if (++n == nr_nsids) + break; + } + up_write(&ctrl->namespaces_rwsem); + WARN_ON_ONCE(n < nr_nsids); + return 0; +} + +static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only) +{ + u32 nr_change_groups = 0; + int error; + + mutex_lock(&ctrl->ana_lock); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, + groups_only ? NVME_ANA_LOG_RGO : 0, + ctrl->ana_log_buf, ctrl->ana_log_size, 0); + if (error) { + dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); + goto out_unlock; + } + + error = nvme_parse_ana_log(ctrl, &nr_change_groups, + nvme_update_ana_state); + if (error) + goto out_unlock; + + /* + * In theory we should have an ANATT timer per group as they might enter + * the change state at different times. But that is a lot of overhead + * just to protect against a target that keeps entering new changes + * states while never finishing previous ones. But we'll still + * eventually time out once all groups are in change state, so this + * isn't a big deal. + * + * We also double the ANATT value to provide some slack for transports + * or AEN processing overhead. + */ + if (nr_change_groups) + mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); + else + del_timer_sync(&ctrl->anatt_timer); +out_unlock: + mutex_unlock(&ctrl->ana_lock); + return error; +} + +static void nvme_ana_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + + nvme_read_ana_log(ctrl, false); +} + +static void nvme_anatt_timeout(struct timer_list *t) +{ + struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); + + dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); + nvme_reset_ctrl(ctrl); +} + +void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ + if (!nvme_ctrl_use_ana(ctrl)) + return; + del_timer_sync(&ctrl->anatt_timer); + cancel_work_sync(&ctrl->ana_work); +} + +static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); +} +DEVICE_ATTR_RO(ana_grpid); + +static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); +} +DEVICE_ATTR_RO(ana_state); + +static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + struct nvme_ns *ns = data; + + if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { + nvme_update_ns_ana_state(desc, ns); + return -ENXIO; /* just break out of the loop */ + } + + return 0; +} + +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + if (nvme_ctrl_use_ana(ns->ctrl)) { + mutex_lock(&ns->ctrl->ana_lock); + ns->ana_grpid = le32_to_cpu(id->anagrpid); + nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); + mutex_unlock(&ns->ctrl->ana_lock); + } else { + mutex_lock(&ns->head->lock); + ns->ana_state = NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); } - mutex_unlock(&head->subsys->lock); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - sysfs_remove_group(&disk_to_dev(head->disk)->kobj, - &nvme_ns_id_attr_group); - del_gendisk(head->disk); + if (head->disk->flags & GENHD_FL_UP) { + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); + del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + int error; + + if (!nvme_ctrl_use_ana(ctrl)) + return 0; + + ctrl->anacap = id->anacap; + ctrl->anatt = id->anatt; + ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); + ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); + + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); + if (!(ctrl->anacap & (1 << 6))) + ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); + + if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + dev_err(ctrl->device, + "ANA log page size (%zd) larger than MDTS (%d).\n", + ctrl->ana_log_size, + ctrl->max_hw_sectors << SECTOR_SHIFT); + dev_err(ctrl->device, "disabling ANA support.\n"); + return 0; + } + + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + goto out; + + error = nvme_read_ana_log(ctrl, true); + if (error) + goto out_free_ana_log_buf; + return 0; +out_free_ana_log_buf: + kfree(ctrl->ana_log_buf); +out: + return -ENOMEM; +} + +void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ + kfree(ctrl->ana_log_buf); +} + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0c4a33df3b2f..bb4a2003c097 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -102,6 +102,7 @@ struct nvme_request { u8 retries; u8 flags; u16 status; + struct nvme_ctrl *ctrl; }; /* @@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req) return blk_mq_rq_to_pdu(req); } +static inline u16 nvme_req_qid(struct request *req) +{ + if (!req->rq_disk) + return 0; + return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; +} + /* The below value is the specific amount of delay needed before checking * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was @@ -175,6 +183,7 @@ struct nvme_ctrl { u16 oacs; u16 nssa; u16 nr_streams; + u32 max_namespaces; atomic_t abort_limit; u8 vwc; u32 vs; @@ -197,6 +206,19 @@ struct nvme_ctrl { struct work_struct fw_act_work; unsigned long events; +#ifdef CONFIG_NVME_MULTIPATH + /* asymmetric namespace access: */ + u8 anacap; + u8 anatt; + u32 anagrpmax; + u32 nanagrpid; + struct mutex ana_lock; + struct nvme_ana_rsp_hdr *ana_log_buf; + size_t ana_log_size; + struct timer_list anatt_timer; + struct work_struct ana_work; +#endif + /* Power saving configuration */ u64 ps_max_latency_us; bool apst_enabled; @@ -261,6 +283,7 @@ struct nvme_ns_head { struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; + struct mutex lock; #endif struct list_head list; struct srcu_struct srcu; @@ -287,6 +310,10 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_ana_state ana_state; + u32 ana_grpid; +#endif struct list_head siblings; struct nvm_dev *ndev; struct kref kref; @@ -299,8 +326,9 @@ struct nvme_ns { bool ext; u8 pi_type; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 +#define NVME_NS_ANA_PENDING 2 u16 noiob; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -356,14 +384,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -static inline void nvme_cleanup_cmd(struct request *req) -{ - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); - } -} - static inline void nvme_end_request(struct request *req, __le16 status, union nvme_result result) { @@ -420,6 +440,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); +void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, @@ -435,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, size_t size, u64 offset); +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset); extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH +bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); -bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); -void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_uninit(struct nvme_ctrl *ctrl); +void nvme_mpath_stop(struct nvme_ctrl *ctrl); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -468,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +extern struct device_attribute dev_attr_ana_grpid; +extern struct device_attribute dev_attr_ana_state; + #else +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return false; +} /* * Without the multipath code enabled, multiple controller per subsystems are * visible as devices and thus we cannot use the subsystem instance. @@ -482,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, static inline void nvme_failover_req(struct request *req) { } -static inline bool nvme_req_needs_failover(struct request *req, - blk_status_t error) -{ - return false; -} static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { } @@ -495,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, { return 0; } -static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_add_disk(struct nvme_ns *ns, + struct nvme_id_ns *id) { } static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) @@ -507,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + return 0; +} +static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ddd441b1516a..1b9951d2067e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, BUG_ON(!nvmeq); iod->nvmeq = nvmeq; + + nvme_req(req)->ctrl = &dev->ctrl; return 0; } @@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) mempool_free(iod->sg, dev->iod_mempool); } -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == v) - pi->ref_tag = cpu_to_be32(p); -} - -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == p) - pi->ref_tag = cpu_to_be32(v); -} - -/** - * nvme_dif_remap - remaps ref tags to bip seed and physical lba - * - * The virtual start sector is the one that was originally submitted by the - * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical - * start sector may be different. Remap protection information to match the - * physical LBA on writes, and back to the original seed on reads. - * - * Type 0 and 3 do not have a ref tag, so no remapping required. - */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ - struct nvme_ns *ns = req->rq_disk->private_data; - struct bio_integrity_payload *bip; - struct t10_pi_tuple *pi; - void *p, *pmap; - u32 i, nlb, ts, phys, virt; - - if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) - return; - - bip = bio_integrity(req->bio); - if (!bip) - return; - - pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - - p = pmap; - virt = bip_get_seed(bip); - phys = nvme_block_nr(ns, blk_rq_pos(req)); - nlb = (blk_rq_bytes(req) >> ns->lba_shift); - ts = ns->disk->queue->integrity.tuple_size; - - for (i = 0; i < nlb; i++, virt++, phys++) { - pi = (struct t10_pi_tuple *)p; - dif_swap(phys, virt, pi); - p += ts; - } - kunmap_atomic(pmap); -} -#else /* CONFIG_BLK_DEV_INTEGRITY */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ -} -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -#endif - static void nvme_print_sgl(struct scatterlist *sgl, int nents) { int i; @@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) goto out_unmap; - if (req_op(req) == REQ_OP_WRITE) - nvme_dif_remap(req, nvme_dif_prep); - if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) goto out_unmap; } @@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) if (iod->nents) { dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - if (blk_integrity_rq(req)) { - if (req_op(req) == REQ_OP_READ) - nvme_dif_remap(req, nvme_dif_complete); + if (blk_integrity_rq(req)) dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); - } } nvme_cleanup_cmd(req); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 72e8e8e7d2d7..dc042017c293 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -40,13 +40,14 @@ #define NVME_RDMA_MAX_SEGMENTS 256 -#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 +#define NVME_RDMA_MAX_INLINE_SEGMENTS 4 struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; struct kref ref; struct list_head entry; + unsigned int num_inline_segments; }; struct nvme_rdma_qe { @@ -117,6 +118,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + bool use_inline_data; }; static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) @@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) /* +1 for drain */ init_attr.cap.max_recv_wr = queue->queue_size + 1; init_attr.cap.max_recv_sge = 1; - init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = queue->ib_cq; @@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct ib_device *ibdev = dev->dev; int ret; + nvme_req(rq)->ctrl = &ctrl->ctrl; ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (ret) @@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) goto out_free_pd; } + ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, + ndev->dev->attrs.max_send_sge - 1); list_add(&ndev->entry, &device_list); out_unlock: mutex_unlock(&device_list_mutex); @@ -868,6 +873,31 @@ out_free_io_queues: return ret; } +static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, + &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_destroy_admin_queue(ctrl, remove); +} + +static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, + &ctrl->ctrl); + if (remove) + nvme_start_queues(&ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, remove); + } +} + static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) } } -static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { - struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), - struct nvme_rdma_ctrl, reconnect_work); + int ret = -EINVAL; bool changed; - int ret; - ++ctrl->ctrl.nr_reconnects; - - ret = nvme_rdma_configure_admin_queue(ctrl, false); + ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) - goto requeue; + return ret; + + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (!(ctrl->ctrl.sgls & (1 << 2))) { + dev_err(ctrl->ctrl.device, + "Mandatory keyed sgls are not supported!\n"); + goto destroy_admin; + } + + if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); + } + + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; + } + + if (ctrl->ctrl.sgls & (1 << 20)) + ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); + ret = nvme_rdma_configure_io_queues(ctrl, new); if (ret) goto destroy_admin; } @@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (!changed) { /* state change failure is ok if we're in DELETING state */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; + ret = -EINVAL; + goto destroy_io; } nvme_start_ctrl(&ctrl->ctrl); + return 0; + +destroy_io: + if (ctrl->ctrl.queue_count > 1) + nvme_rdma_destroy_io_queues(ctrl, new); +destroy_admin: + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_rdma_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + + ++ctrl->ctrl.nr_reconnects; + + if (nvme_rdma_setup_ctrl(ctrl, false)) + goto requeue; dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", ctrl->ctrl.nr_reconnects); @@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; -destroy_admin: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); nvme_stop_keep_alive(&ctrl->ctrl); - - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, false); - } - - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_destroy_admin_queue(ctrl, false); - - /* - * queues are not a live anymore, so restart the queues to fail fast - * new IO - */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); + nvme_rdma_teardown_admin_queue(ctrl, false); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1089,19 +1142,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c) } static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, - struct nvme_rdma_request *req, struct nvme_command *c) + struct nvme_rdma_request *req, struct nvme_command *c, + int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + struct scatterlist *sgl = req->sg_table.sgl; + struct ib_sge *sge = &req->sge[1]; + u32 len = 0; + int i; - req->sge[1].addr = sg_dma_address(req->sg_table.sgl); - req->sge[1].length = sg_dma_len(req->sg_table.sgl); - req->sge[1].lkey = queue->device->pd->local_dma_lkey; + for (i = 0; i < count; i++, sgl++, sge++) { + sge->addr = sg_dma_address(sgl); + sge->length = sg_dma_len(sgl); + sge->lkey = queue->device->pd->local_dma_lkey; + len += sge->length; + } sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); - sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->length = cpu_to_le32(len); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; - req->num_sge++; + req->num_sge += count; return 0; } @@ -1194,15 +1255,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, goto out_free_table; } - if (count == 1) { + if (count <= dev->num_inline_segments) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && blk_rq_payload_bytes(rq) <= nvme_rdma_inline_data_size(queue)) { - ret = nvme_rdma_map_sg_inline(queue, req, c); + ret = nvme_rdma_map_sg_inline(queue, req, c, count); goto out; } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { ret = nvme_rdma_map_sg_single(queue, req, c); goto out; } @@ -1573,6 +1635,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: nvme_rdma_destroy_queue_ib(queue); + /* fall through */ case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); @@ -1735,25 +1798,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, shutdown); - } - + nvme_rdma_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); - - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_destroy_admin_queue(ctrl, shutdown); + nvme_rdma_teardown_admin_queue(ctrl, shutdown); } static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) @@ -1765,8 +1815,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); - int ret; - bool changed; nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); @@ -1777,25 +1825,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_rdma_configure_admin_queue(ctrl, false); - if (ret) + if (nvme_rdma_setup_ctrl(ctrl, false)) goto out_fail; - if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); - if (ret) - goto out_fail; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - if (!changed) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; - } - - nvme_start_ctrl(&ctrl->ctrl); - return; out_fail: @@ -1958,49 +1990,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); - ret = nvme_rdma_configure_admin_queue(ctrl, true); + ret = nvme_rdma_setup_ctrl(ctrl, true); if (ret) goto out_uninit_ctrl; - /* sanity check icdoff */ - if (ctrl->ctrl.icdoff) { - dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* sanity check keyed sgls */ - if (!(ctrl->ctrl.sgls & (1 << 2))) { - dev_err(ctrl->ctrl.device, - "Mandatory keyed sgls are not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* only warn if argument is too large here, will clamp later */ - if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl sqsize %u, clamping down\n", - opts->queue_size, ctrl->ctrl.sqsize + 1); - } - - /* warn if maxcmd is lower than sqsize+1 */ - if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { - dev_warn(ctrl->ctrl.device, - "sqsize %u > ctrl maxcmd %u, clamping down\n", - ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); - ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; - } - - if (opts->nr_io_queues) { - ret = nvme_rdma_configure_io_queues(ctrl, true); - if (ret) - goto out_remove_admin_queue; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); @@ -2010,13 +2003,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - nvme_start_ctrl(&ctrl->ctrl); - return &ctrl->ctrl; -out_remove_admin_queue: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, true); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 41944bbef835..25b0e310f4a8 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_common(p, cdw10); } } + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 01390f0e1671..a490790d6691 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -50,13 +50,8 @@ nvme_admin_opcode_name(nvme_admin_security_recv), \ nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) -const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_admin_cmd(opcode, cdw10) \ - nvme_trace_parse_admin_cmd(p, opcode, cdw10) - #define nvme_opcode_name(opcode) { opcode, #opcode } -#define show_opcode_name(val) \ +#define show_nvm_opcode_name(val) \ __print_symbolic(val, \ nvme_opcode_name(nvme_cmd_flush), \ nvme_opcode_name(nvme_cmd_write), \ @@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, nvme_opcode_name(nvme_cmd_resv_acquire), \ nvme_opcode_name(nvme_cmd_resv_release)) -const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_cmd(opcode, cdw10) \ - nvme_trace_parse_nvm_cmd(p, opcode, cdw10) - -TRACE_EVENT(nvme_setup_admin_cmd, - TP_PROTO(struct nvme_command *cmd), - TP_ARGS(cmd), - TP_STRUCT__entry( - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u64, metadata) - __array(u8, cdw10, 24) - ), - TP_fast_assign( - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); - ), - TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->cid, __entry->flags, __entry->metadata, - show_admin_opcode_name(__entry->opcode), - __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) -); - +#define show_opcode_name(qid, opcode) \ + (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) -TRACE_EVENT(nvme_setup_nvm_cmd, - TP_PROTO(int qid, struct nvme_command *cmd), - TP_ARGS(qid, cmd), +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); + +#define parse_nvme_cmd(qid, opcode, cdw10) \ + (qid ? \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10)) + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvme_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nvme_setup_cmd, + TP_PROTO(struct request *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), TP_STRUCT__entry( - __field(int, qid) - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u32, nsid) - __field(u64, metadata) - __array(u8, cdw10, 24) + __array(char, disk, DISK_NAME_LEN) + __field(int, ctrl_id) + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) ), TP_fast_assign( - __entry->qid = qid; - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->nsid = le32_to_cpu(cmd->common.nsid); - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __entry->qid = nvme_req_qid(req); + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + __assign_disk_name(__entry->disk, req->rq_disk); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); ), - TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->qid, __entry->nsid, __entry->cid, + TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, - show_opcode_name(__entry->opcode), - __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) + show_opcode_name(__entry->qid, __entry->opcode), + parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( - __field(int, qid) - __field(int, cid) - __field(u64, result) - __field(u8, retries) - __field(u8, flags) - __field(u16, status) + __array(char, disk, DISK_NAME_LEN) + __field(int, ctrl_id) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) ), TP_fast_assign( - __entry->qid = req->q->id; - __entry->cid = req->tag; - __entry->result = le64_to_cpu(nvme_req(req)->result.u64); - __entry->retries = nvme_req(req)->retries; - __entry->flags = nvme_req(req)->flags; - __entry->status = nvme_req(req)->status; + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __entry->qid = nvme_req_qid(req); + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + __assign_disk_name(__entry->disk, req->rq_disk); ), - TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->result, __entry->retries, __entry->flags, __entry->status) |