diff options
author | Jens Axboe <axboe@kernel.dk> | 2018-08-06 04:34:09 +0300 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2018-08-06 04:34:09 +0300 |
commit | f87b0f0dfa5496fc4a701c071fa3ce7ad7ca5152 (patch) | |
tree | bfaf5858c80dc1056d30825f35762b7e3dad65d4 | |
parent | 05b9ba4b550ff67d7362608828405f9e389e8988 (diff) | |
parent | b369b30cf510fe94d8884837039362e2ec223cec (diff) | |
download | linux-f87b0f0dfa5496fc4a701c071fa3ce7ad7ca5152.tar.xz |
Merge branch 'nvme-4.19' of git://git.infradead.org/nvme into for-4.19/block2
Pull NVMe changes from Christoph:
"This contains the support for TP4004, Asymmetric Namespace Access,
which makes NVMe multipathing usable in practice."
* 'nvme-4.19' of git://git.infradead.org/nvme:
nvmet: use Retain Async Event bit to clear AEN
nvmet: support configuring ANA groups
nvmet: add minimal ANA support
nvmet: track and limit the number of namespaces per subsystem
nvmet: keep a port pointer in nvmet_ctrl
nvme: add ANA support
nvme: remove nvme_req_needs_failover
nvme: simplify the API for getting log pages
nvme.h: add ANA definitions
nvme.h: add support for the log specific field
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | drivers/nvme/host/core.c | 69 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 5 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 349 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 61 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 104 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 190 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 71 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 53 | ||||
-rw-r--r-- | include/linux/nvme.h | 52 |
9 files changed, 880 insertions, 74 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9347f20190e5..603fe59756fb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req); if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if (nvme_req_needs_failover(req, status)) { + if ((req->cmd_flags & REQ_NVME_MPATH) && + blk_path_error(status)) { nvme_failover_req(req); return; } @@ -1067,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -2281,21 +2282,16 @@ out_unlock: return ret; } -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, - size_t size, u64 offset) +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset) { struct nvme_command c = { }; unsigned long dwlen = size / 4 - 1; c.get_log_page.opcode = nvme_admin_get_log_page; - - if (ns) - c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); - else - c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); - + c.get_log_page.nsid = cpu_to_le32(nsid); c.get_log_page.lid = log_page; + c.get_log_page.lsp = lsp; c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); @@ -2304,12 +2300,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, - size_t size) -{ - return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); -} - static int nvme_get_effects_log(struct nvme_ctrl *ctrl) { int ret; @@ -2320,8 +2310,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) if (!ctrl->effects) return 0; - ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, - sizeof(*ctrl->effects)); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, + ctrl->effects, sizeof(*ctrl->effects), 0); if (ret) { kfree(ctrl->effects); ctrl->effects = NULL; @@ -2412,6 +2402,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) nvme_set_queue_limits(ctrl, ctrl->admin_q); ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->max_namespaces = le32_to_cpu(id->mnan); if (id->rtd3e) { /* us -> s */ @@ -2471,8 +2462,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } + ret = nvme_mpath_init(ctrl, id); kfree(id); + if (ret < 0) + return ret; + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) @@ -2691,6 +2686,10 @@ static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_ana_grpid.attr, + &dev_attr_ana_state.attr, +#endif NULL, }; @@ -2713,6 +2712,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { + if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + return 0; + if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) + return 0; + } +#endif return a->mode; } @@ -3086,8 +3093,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_get_ctrl(ctrl); - kfree(id); - device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group)) @@ -3097,8 +3102,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); - nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(ns); + kfree(id); + return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -3240,7 +3247,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, + log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -3357,9 +3365,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) - dev_warn(ctrl->device, - "Get FW SLOT INFO log error\n"); + if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, + sizeof(*log), 0)) + dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -3405,6 +3413,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) case NVME_AER_NOTICE_FW_ACT_STARTING: queue_work(nvme_wq, &ctrl->fw_act_work); break; +#ifdef CONFIG_NVME_MULTIPATH + case NVME_AER_NOTICE_ANA: + if (!ctrl->ana_log_buf) + break; + queue_work(nvme_wq, &ctrl->ana_work); + break; +#endif default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3437,6 +3452,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); @@ -3474,6 +3490,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); + nvme_mpath_uninit(ctrl); if (subsys) { mutex_lock(&subsys->lock); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d9e4cccd5b66..7e4cf4eb9d66 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -604,8 +604,9 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, while (left) { len = min_t(unsigned int, left, max_len); - ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, - dev_meta, len, offset); + ret = nvme_get_log(ctrl, ns->head->ns_id, + NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, + offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1ffd3e8b13a1..c643872f8dac 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Christoph Hellwig. + * Copyright (c) 2017-2018 Christoph Hellwig. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,6 +20,11 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return multipath && (ctrl->subsys->cmic & (1 << 3)); +} + /* * If multipathing is enabled we need to always use the subsystem instance * number for numbering our devices to avoid conflicts between subsystems that @@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status; unsigned long flags; spin_lock_irqsave(&ns->head->requeue_lock, flags); @@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req) spin_unlock_irqrestore(&ns->head->requeue_lock, flags); blk_mq_end_request(req, 0); - nvme_reset_ctrl(ns->ctrl); - kblockd_schedule_work(&ns->head->requeue_work); -} + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + /* + * If we got back an ANA error we know the controller is alive, + * but not ready to serve this namespaces. The spec suggests + * we should update our general state here, but due to the fact + * that the admin and I/O queues are not serialized that is + * fundamentally racy. So instead just clear the current path, + * mark the the path as pending and kick of a re-read of the ANA + * log page ASAP. + */ + nvme_mpath_clear_current_path(ns); + if (ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + break; + default: + /* + * Reset the controller for any non-ANA error as we don't know + * what caused the error. + */ + nvme_reset_ctrl(ns->ctrl); + break; + } -bool nvme_req_needs_failover(struct request *req, blk_status_t error) -{ - if (!(req->cmd_flags & REQ_NVME_MPATH)) - return false; - return blk_path_error(error); + kblockd_schedule_work(&ns->head->requeue_work); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) @@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) up_read(&ctrl->namespaces_rwsem); } +static const char *nvme_ana_state_names[] = { + [0] = "invalid state", + [NVME_ANA_OPTIMIZED] = "optimized", + [NVME_ANA_NONOPTIMIZED] = "non-optimized", + [NVME_ANA_INACCESSIBLE] = "inaccessible", + [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", + [NVME_ANA_CHANGE] = "change", +}; + static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) { - struct nvme_ns *ns; + struct nvme_ns *ns, *fallback = NULL; list_for_each_entry_rcu(ns, &head->list, siblings) { - if (ns->ctrl->state == NVME_CTRL_LIVE) { + if (ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + continue; + switch (ns->ana_state) { + case NVME_ANA_OPTIMIZED: rcu_assign_pointer(head->current_path, ns); return ns; + case NVME_ANA_NONOPTIMIZED: + fallback = ns; + break; + default: + break; } } - return NULL; + if (fallback) + rcu_assign_pointer(head->current_path, fallback); + return fallback; +} + +static inline bool nvme_path_is_optimized(struct nvme_ns *ns) +{ + return ns->ctrl->state == NVME_CTRL_LIVE && + ns->ana_state == NVME_ANA_OPTIMIZED; } inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) { struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); - if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + if (unlikely(!ns || !nvme_path_is_optimized(ns))) ns = __nvme_find_path(head); return ns; } @@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path, &head->srcu); - if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + if (likely(ns && nvme_path_is_optimized(ns))) found = ns->queue->poll_fn(q, qc); srcu_read_unlock(&head->srcu, srcu_idx); return found; @@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) struct request_queue *q; bool vwc = false; + mutex_init(&head->lock); bio_list_init(&head->requeue_list); spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); @@ -220,29 +273,232 @@ out: return -ENOMEM; } -void nvme_mpath_add_disk(struct nvme_ns_head *head) +static void nvme_mpath_set_live(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + + lockdep_assert_held(&ns->head->lock); + if (!head->disk) return; - mutex_lock(&head->subsys->lock); if (!(head->disk->flags & GENHD_FL_UP)) { device_add_disk(&head->subsys->dev, head->disk); if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, &nvme_ns_id_attr_group)) - pr_warn("%s: failed to create sysfs group for identification\n", - head->disk->disk_name); + dev_warn(&head->subsys->dev, + "failed to create id group.\n"); + } + + kblockd_schedule_work(&ns->head->requeue_work); +} + +static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, + int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, + void *)) +{ + void *base = ctrl->ana_log_buf; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + int error, i; + + lockdep_assert_held(&ctrl->ana_lock); + + for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { + struct nvme_ana_group_desc *desc = base + offset; + u32 nr_nsids = le32_to_cpu(desc->nnsids); + size_t nsid_buf_size = nr_nsids * sizeof(__le32); + + if (WARN_ON_ONCE(desc->grpid == 0)) + return -EINVAL; + if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state == 0)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) + return -EINVAL; + + offset += sizeof(*desc); + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) + return -EINVAL; + + error = cb(ctrl, desc, data); + if (error) + return error; + + offset += nsid_buf_size; + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + } + + return 0; +} + +static inline bool nvme_state_is_live(enum nvme_ana_state state) +{ + return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; +} + +static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, + struct nvme_ns *ns) +{ + enum nvme_ana_state old; + + mutex_lock(&ns->head->lock); + old = ns->ana_state; + ns->ana_grpid = le32_to_cpu(desc->grpid); + ns->ana_state = desc->state; + clear_bit(NVME_NS_ANA_PENDING, &ns->flags); + + if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old)) + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); +} + +static int nvme_update_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; + unsigned *nr_change_groups = data; + struct nvme_ns *ns; + + dev_info(ctrl->device, "ANA group %d: %s.\n", + le32_to_cpu(desc->grpid), + nvme_ana_state_names[desc->state]); + + if (desc->state == NVME_ANA_CHANGE) + (*nr_change_groups)++; + + if (!nr_nsids) + return 0; + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + continue; + nvme_update_ns_ana_state(desc, ns); + if (++n == nr_nsids) + break; + } + up_write(&ctrl->namespaces_rwsem); + WARN_ON_ONCE(n < nr_nsids); + return 0; +} + +static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only) +{ + u32 nr_change_groups = 0; + int error; + + mutex_lock(&ctrl->ana_lock); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, + groups_only ? NVME_ANA_LOG_RGO : 0, + ctrl->ana_log_buf, ctrl->ana_log_size, 0); + if (error) { + dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); + goto out_unlock; + } + + error = nvme_parse_ana_log(ctrl, &nr_change_groups, + nvme_update_ana_state); + if (error) + goto out_unlock; + + /* + * In theory we should have an ANATT timer per group as they might enter + * the change state at different times. But that is a lot of overhead + * just to protect against a target that keeps entering new changes + * states while never finishing previous ones. But we'll still + * eventually time out once all groups are in change state, so this + * isn't a big deal. + * + * We also double the ANATT value to provide some slack for transports + * or AEN processing overhead. + */ + if (nr_change_groups) + mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); + else + del_timer_sync(&ctrl->anatt_timer); +out_unlock: + mutex_unlock(&ctrl->ana_lock); + return error; +} + +static void nvme_ana_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + + nvme_read_ana_log(ctrl, false); +} + +static void nvme_anatt_timeout(struct timer_list *t) +{ + struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); + + dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); + nvme_reset_ctrl(ctrl); +} + +void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ + if (!nvme_ctrl_use_ana(ctrl)) + return; + del_timer_sync(&ctrl->anatt_timer); + cancel_work_sync(&ctrl->ana_work); +} + +static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); +} +DEVICE_ATTR_RO(ana_grpid); + +static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); +} +DEVICE_ATTR_RO(ana_state); + +static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + struct nvme_ns *ns = data; + + if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { + nvme_update_ns_ana_state(desc, ns); + return -ENXIO; /* just break out of the loop */ + } + + return 0; +} + +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + if (nvme_ctrl_use_ana(ns->ctrl)) { + mutex_lock(&ns->ctrl->ana_lock); + ns->ana_grpid = le32_to_cpu(id->anagrpid); + nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); + mutex_unlock(&ns->ctrl->ana_lock); + } else { + mutex_lock(&ns->head->lock); + ns->ana_state = NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); } - mutex_unlock(&head->subsys->lock); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - sysfs_remove_group(&disk_to_dev(head->disk)->kobj, - &nvme_ns_id_attr_group); - del_gendisk(head->disk); + if (head->disk->flags & GENHD_FL_UP) { + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); + del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + int error; + + if (!nvme_ctrl_use_ana(ctrl)) + return 0; + + ctrl->anacap = id->anacap; + ctrl->anatt = id->anatt; + ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); + ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); + + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); + if (!(ctrl->anacap & (1 << 6))) + ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); + + if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + dev_err(ctrl->device, + "ANA log page size (%zd) larger than MDTS (%d).\n", + ctrl->ana_log_size, + ctrl->max_hw_sectors << SECTOR_SHIFT); + dev_err(ctrl->device, "disabling ANA support.\n"); + return 0; + } + + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + goto out; + + error = nvme_read_ana_log(ctrl, true); + if (error) + goto out_free_ana_log_buf; + return 0; +out_free_ana_log_buf: + kfree(ctrl->ana_log_buf); +out: + return -ENOMEM; +} + +void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ + kfree(ctrl->ana_log_buf); +} + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cf970f9543a6..bb4a2003c097 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -183,6 +183,7 @@ struct nvme_ctrl { u16 oacs; u16 nssa; u16 nr_streams; + u32 max_namespaces; atomic_t abort_limit; u8 vwc; u32 vs; @@ -205,6 +206,19 @@ struct nvme_ctrl { struct work_struct fw_act_work; unsigned long events; +#ifdef CONFIG_NVME_MULTIPATH + /* asymmetric namespace access: */ + u8 anacap; + u8 anatt; + u32 anagrpmax; + u32 nanagrpid; + struct mutex ana_lock; + struct nvme_ana_rsp_hdr *ana_log_buf; + size_t ana_log_size; + struct timer_list anatt_timer; + struct work_struct ana_work; +#endif + /* Power saving configuration */ u64 ps_max_latency_us; bool apst_enabled; @@ -269,6 +283,7 @@ struct nvme_ns_head { struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; + struct mutex lock; #endif struct list_head list; struct srcu_struct srcu; @@ -295,6 +310,10 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_ana_state ana_state; + u32 ana_grpid; +#endif struct list_head siblings; struct nvm_dev *ndev; struct kref kref; @@ -307,8 +326,9 @@ struct nvme_ns { bool ext; u8 pi_type; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 +#define NVME_NS_ANA_PENDING 2 u16 noiob; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -436,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, size_t size, u64 offset); +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset); extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH +bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); -bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); -void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_uninit(struct nvme_ctrl *ctrl); +void nvme_mpath_stop(struct nvme_ctrl *ctrl); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -469,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +extern struct device_attribute dev_attr_ana_grpid; +extern struct device_attribute dev_attr_ana_state; + #else +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return false; +} /* * Without the multipath code enabled, multiple controller per subsystems are * visible as devices and thus we cannot use the subsystem instance. @@ -483,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, static inline void nvme_failover_req(struct request *req) { } -static inline bool nvme_req_needs_failover(struct request *req, - blk_status_t error) -{ - return false; -} static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { } @@ -496,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, { return 0; } -static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_add_disk(struct nvme_ns *ns, + struct nvme_id_ns *id) { } static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) @@ -508,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + return 0; +} +static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 16a9b24270f9..f517bc562d26 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -19,6 +19,19 @@ #include <asm/unaligned.h> #include "nvmet.h" +/* + * This helper allows us to clear the AEN based on the RAE bit, + * Please use this helper when processing the log pages which are + * associated with the AEN. + */ +static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; + + if (!rae) + clear_bit(aen_bit, &req->sq->ctrl->aen_masked); +} + u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); @@ -176,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) if (!status) status = nvmet_zero_sgl(req, len, req->data_len - len); ctrl->nr_changed_ns = 0; - clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked); + nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); } +static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, + struct nvme_ana_group_desc *desc) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 count = 0; + + if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) + if (ns->anagrpid == grpid) + desc->nsids[count++] = cpu_to_le32(ns->nsid); + rcu_read_unlock(); + } + + desc->grpid = cpu_to_le32(grpid); + desc->nnsids = cpu_to_le32(count); + desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + desc->state = req->port->ana_state[grpid]; + memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); + return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32); +} + +static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) +{ + struct nvme_ana_rsp_hdr hdr = { 0, }; + struct nvme_ana_group_desc *desc; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */ + size_t len; + u32 grpid; + u16 ngrps = 0; + u16 status; + + status = NVME_SC_INTERNAL; + desc = kmalloc(sizeof(struct nvme_ana_group_desc) + + NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL); + if (!desc) + goto out; + + down_read(&nvmet_ana_sem); + for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) { + if (!nvmet_ana_group_enabled[grpid]) + continue; + len = nvmet_format_ana_group(req, grpid, desc); + status = nvmet_copy_to_sgl(req, offset, desc, len); + if (status) + break; + offset += len; + ngrps++; + } + + hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + hdr.ngrps = cpu_to_le16(ngrps); + nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); + up_read(&nvmet_ana_sem); + + kfree(desc); + + /* copy the header last once we know the number of groups */ + status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr)); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -213,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) * the safest is to leave it as zeroes. */ - /* we support multiple ports and multiples hosts: */ - id->cmic = (1 << 0) | (1 << 1); + /* we support multiple ports, multiples hosts and ANA: */ + id->cmic = (1 << 0) | (1 << 1) | (1 << 3); /* no limit on data transfer sizes for now */ id->mdts = 0; @@ -252,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES); @@ -281,6 +359,11 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->msdbd = ctrl->ops->msdbd; + id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); + id->anatt = 10; /* random value */ + id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS); + id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS); + /* * Meh, we don't really support any power state. Fake up the same * values that qemu does. @@ -322,8 +405,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. */ - id->ncap = id->nuse = id->nsze = - cpu_to_le64(ns->size >> ns->blksize_shift); + id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift); + switch (req->port->ana_state[ns->anagrpid]) { + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_PERSISTENT_LOSS: + break; + default: + id->nuse = id->nsze; + break; + } /* * We just provide a single LBA format that matches what the @@ -337,6 +427,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * controllers, but also with any other user of the block device. */ id->nmic = (1 << 0); + id->anagrpid = cpu_to_le32(ns->anagrpid); memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid)); @@ -619,6 +710,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_LOG_CMD_EFFECTS: req->execute = nvmet_execute_get_log_cmd_effects_ns; return 0; + case NVME_LOG_ANA: + req->execute = nvmet_execute_get_log_page_ana; + return 0; } break; case nvme_admin_identify: diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 3ba5ea5c4376..51f5a8c092b4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -411,6 +411,39 @@ out_unlock: CONFIGFS_ATTR(nvmet_ns_, device_nguid); +static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid); +} + +static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + u32 oldgrpid, newgrpid; + int ret; + + ret = kstrtou32(page, 0, &newgrpid); + if (ret) + return ret; + + if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS) + return -EINVAL; + + down_write(&nvmet_ana_sem); + oldgrpid = ns->anagrpid; + nvmet_ana_group_enabled[newgrpid]++; + ns->anagrpid = newgrpid; + nvmet_ana_group_enabled[oldgrpid]--; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + + nvmet_send_ana_event(ns->subsys, NULL); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, ana_grpid); + static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) { return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); @@ -468,6 +501,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, &nvmet_ns_attr_device_uuid, + &nvmet_ns_attr_ana_grpid, &nvmet_ns_attr_enable, &nvmet_ns_attr_buffered_io, NULL, @@ -916,6 +950,134 @@ static const struct config_item_type nvmet_referrals_type = { .ct_group_ops = &nvmet_referral_group_ops, }; +static struct { + enum nvme_ana_state state; + const char *name; +} nvmet_ana_state_names[] = { + { NVME_ANA_OPTIMIZED, "optimized" }, + { NVME_ANA_NONOPTIMIZED, "non-optimized" }, + { NVME_ANA_INACCESSIBLE, "inaccessible" }, + { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" }, + { NVME_ANA_CHANGE, "change" }, +}; + +static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item, + char *page) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + enum nvme_ana_state state = grp->port->ana_state[grp->grpid]; + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { + if (state != nvmet_ana_state_names[i].state) + continue; + return sprintf(page, "%s\n", nvmet_ana_state_names[i].name); + } + + return sprintf(page, "\n"); +} + +static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { + if (sysfs_streq(page, nvmet_ana_state_names[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for ana_state\n", page); + return -EINVAL; + +found: + down_write(&nvmet_ana_sem); + grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + return count; +} + +CONFIGFS_ATTR(nvmet_ana_group_, ana_state); + +static struct configfs_attribute *nvmet_ana_group_attrs[] = { + &nvmet_ana_group_attr_ana_state, + NULL, +}; + +static void nvmet_ana_group_release(struct config_item *item) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + + if (grp == &grp->port->ana_default_group) + return; + + down_write(&nvmet_ana_sem); + grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE; + nvmet_ana_group_enabled[grp->grpid]--; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + kfree(grp); +} + +static struct configfs_item_operations nvmet_ana_group_item_ops = { + .release = nvmet_ana_group_release, +}; + +static const struct config_item_type nvmet_ana_group_type = { + .ct_item_ops = &nvmet_ana_group_item_ops, + .ct_attrs = nvmet_ana_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ana_groups_make_group( + struct config_group *group, const char *name) +{ + struct nvmet_port *port = ana_groups_to_port(&group->cg_item); + struct nvmet_ana_group *grp; + u32 grpid; + int ret; + + ret = kstrtou32(name, 0, &grpid); + if (ret) + goto out; + + ret = -EINVAL; + if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS) + goto out; + + ret = -ENOMEM; + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + goto out; + grp->port = port; + grp->grpid = grpid; + + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[grpid]++; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + + config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type); + return &grp->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_ana_groups_group_ops = { + .make_group = nvmet_ana_groups_make_group, +}; + +static const struct config_item_type nvmet_ana_groups_type = { + .ct_group_ops = &nvmet_ana_groups_group_ops, + .ct_owner = THIS_MODULE, +}; + /* * Ports definitions. */ @@ -923,6 +1085,7 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + kfree(port->ana_state); kfree(port); } @@ -951,6 +1114,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, { struct nvmet_port *port; u16 portid; + u32 i; if (kstrtou16(name, 0, &portid)) return ERR_PTR(-EINVAL); @@ -959,6 +1123,20 @@ static struct config_group *nvmet_ports_make(struct config_group *group, if (!port) return ERR_PTR(-ENOMEM); + port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1, + sizeof(*port->ana_state), GFP_KERNEL); + if (!port->ana_state) { + kfree(port); + return ERR_PTR(-ENOMEM); + } + + for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) { + if (i == NVMET_DEFAULT_ANA_GRPID) + port->ana_state[1] = NVME_ANA_OPTIMIZED; + else + port->ana_state[i] = NVME_ANA_INACCESSIBLE; + } + INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); @@ -975,6 +1153,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group, "referrals", &nvmet_referrals_type); configfs_add_default_group(&port->referrals_group, &port->group); + config_group_init_type_name(&port->ana_groups_group, + "ana_groups", &nvmet_ana_groups_type); + configfs_add_default_group(&port->ana_groups_group, &port->group); + + port->ana_default_group.port = port; + port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID; + config_group_init_type_name(&port->ana_default_group.group, + __stringify(NVMET_DEFAULT_ANA_GRPID), + &nvmet_ana_group_type); + configfs_add_default_group(&port->ana_default_group.group, + &port->ana_groups_group); + return &port->group; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ddd85715a00a..3ceb7a03bb2a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -40,6 +40,10 @@ static DEFINE_IDA(cntlid_ida); */ DECLARE_RWSEM(nvmet_config_sem); +u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +u64 nvmet_ana_chgcnt; +DECLARE_RWSEM(nvmet_ana_sem); + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn); @@ -190,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) } } +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (port && ctrl->port != port) + continue; + if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) + continue; + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_ANA, NVME_LOG_ANA); + } + mutex_unlock(&subsys->lock); +} + +void nvmet_port_send_ana_event(struct nvmet_port *port) +{ + struct nvmet_subsys_link *p; + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) + nvmet_send_ana_event(p->subsys, port); + up_read(&nvmet_config_sem); +} + int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) { int ret = 0; @@ -337,9 +368,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns) int nvmet_ns_enable(struct nvmet_ns *ns) { struct nvmet_subsys *subsys = ns->subsys; - int ret = 0; + int ret; mutex_lock(&subsys->lock); + ret = -EMFILE; + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + ret = 0; if (ns->enabled) goto out_unlock; @@ -374,6 +409,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_add_tail_rcu(&ns->dev_link, &old->dev_link); } + subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; @@ -414,6 +450,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) percpu_ref_exit(&ns->ref); mutex_lock(&subsys->lock); + subsys->nr_namespaces--; nvmet_ns_changed(subsys, ns->nsid); nvmet_ns_dev_disable(ns); out_unlock: @@ -424,6 +461,10 @@ void nvmet_ns_free(struct nvmet_ns *ns) { nvmet_ns_disable(ns); + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[ns->anagrpid]--; + up_write(&nvmet_ana_sem); + kfree(ns->device_path); kfree(ns); } @@ -441,6 +482,12 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; + + down_write(&nvmet_ana_sem); + ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; + nvmet_ana_group_enabled[ns->anagrpid]++; + up_write(&nvmet_ana_sem); + uuid_gen(&ns->uuid); ns->buffered_io = false; @@ -548,6 +595,20 @@ int nvmet_sq_init(struct nvmet_sq *sq) } EXPORT_SYMBOL_GPL(nvmet_sq_init); +static inline u16 nvmet_check_ana_state(struct nvmet_port *port, + struct nvmet_ns *ns) +{ + enum nvme_ana_state state = port->ana_state[ns->anagrpid]; + + if (unlikely(state == NVME_ANA_INACCESSIBLE)) + return NVME_SC_ANA_INACCESSIBLE; + if (unlikely(state == NVME_ANA_PERSISTENT_LOSS)) + return NVME_SC_ANA_PERSISTENT_LOSS; + if (unlikely(state == NVME_ANA_CHANGE)) + return NVME_SC_ANA_TRANSITION; + return 0; +} + static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -560,6 +621,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) return NVME_SC_INVALID_NS | NVME_SC_DNR; + ret = nvmet_check_ana_state(req->port, req->ns); + if (unlikely(ret)) + return ret; if (req->ns->file) return nvmet_file_parse_io_cmd(req); @@ -876,6 +940,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, nvmet_init_cap(ctrl); + ctrl->port = req->port; + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); INIT_LIST_HEAD(&ctrl->async_events); @@ -1115,12 +1181,15 @@ static int __init nvmet_init(void) { int error; + nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", WQ_MEM_RECLAIM, 0); if (!buffered_io_wq) { error = -ENOMEM; goto out; } + error = nvmet_init_discovery(); if (error) goto out; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 688993855402..22941045f46e 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,12 +30,11 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 - /* * Supported optional AENs: */ #define NVMET_AEN_CFG_OPTIONAL \ - NVME_AEN_CFG_NS_ATTR + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) /* * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): @@ -64,6 +63,7 @@ struct nvmet_ns { loff_t size; u8 nguid[16]; uuid_t uuid; + u32 anagrpid; bool buffered_io; bool enabled; @@ -98,6 +98,18 @@ struct nvmet_sq { struct completion confirm_done; }; +struct nvmet_ana_group { + struct config_group group; + struct nvmet_port *port; + u32 grpid; +}; + +static inline struct nvmet_ana_group *to_ana_group(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ana_group, + group); +} + /** * struct nvmet_port - Common structure to keep port * information for the target. @@ -115,6 +127,9 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + struct config_group ana_groups_group; + struct nvmet_ana_group ana_default_group; + enum nvme_ana_state *ana_state; void *priv; bool enabled; int inline_data_size; @@ -126,6 +141,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item) group); } +static inline struct nvmet_port *ana_groups_to_port( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + ana_groups_group); +} + struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_cq **cqs; @@ -140,6 +162,8 @@ struct nvmet_ctrl { u16 cntlid; u32 kato; + struct nvmet_port *port; + u32 aen_enabled; unsigned long aen_masked; struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; @@ -168,6 +192,7 @@ struct nvmet_subsys { struct kref ref; struct list_head namespaces; + unsigned int nr_namespaces; unsigned int max_nsid; struct list_head ctrls; @@ -340,6 +365,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns); struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); void nvmet_ns_free(struct nvmet_ns *ns); +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port); +void nvmet_port_send_ana_event(struct nvmet_port *port); + int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); @@ -360,6 +389,22 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE + +/* + * Nice round number that makes a list of nsids fit into a page. + * Should become tunable at some point in the future. + */ +#define NVMET_MAX_NAMESPACES 1024 + +/* + * 0 is not a valid ANA group ID, so we start numbering at 1. + * + * ANA Group 1 exists without manual intervention, has namespaces assigned to it + * by default, and is available in an optimized state through all ports. + */ +#define NVMET_MAX_ANAGRPS 128 +#define NVMET_DEFAULT_ANA_GRPID 1 + #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 @@ -373,6 +418,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys; extern u64 nvmet_genctr; extern struct rw_semaphore nvmet_config_sem; +extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +extern u64 nvmet_ana_chgcnt; +extern struct rw_semaphore nvmet_ana_sem; + bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, const char *hostnqn); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 80dfedcf0bf7..64c9175723de 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -242,7 +242,12 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[174]; + __u8 rsvd338[4]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -258,7 +263,8 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[228]; + __le32 mnan; + __u8 rsvd544[224]; char subnqn[256]; __u8 rsvd1024[768]; __le32 ioccsz; @@ -312,7 +318,9 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[40]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[8]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -425,6 +433,32 @@ struct nvme_effects_log { __u8 resv[2048]; }; +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[7]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -444,11 +478,13 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, + NVME_AER_NOTICE_ANA = 0x03, }; enum { NVME_AEN_CFG_NS_ATTR = 1 << 8, NVME_AEN_CFG_FW_ACT = 1 << 9, + NVME_AEN_CFG_ANA_CHANGE = 1 << 11, }; struct nvme_lba_range_type { @@ -763,6 +799,7 @@ enum { NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -885,7 +922,7 @@ struct nvme_get_log_page_command { __u64 rsvd2[2]; union nvme_data_ptr dptr; __u8 lid; - __u8 rsvd10; + __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; __u16 rsvd11; @@ -1185,6 +1222,13 @@ enum { NVME_SC_ACCESS_DENIED = 0x286, NVME_SC_UNWRITTEN_BLOCK = 0x287, + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_DNR = 0x4000, }; |