summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2018-08-06 04:34:09 +0300
committerJens Axboe <axboe@kernel.dk>2018-08-06 04:34:09 +0300
commitf87b0f0dfa5496fc4a701c071fa3ce7ad7ca5152 (patch)
treebfaf5858c80dc1056d30825f35762b7e3dad65d4
parent05b9ba4b550ff67d7362608828405f9e389e8988 (diff)
parentb369b30cf510fe94d8884837039362e2ec223cec (diff)
downloadlinux-f87b0f0dfa5496fc4a701c071fa3ce7ad7ca5152.tar.xz
Merge branch 'nvme-4.19' of git://git.infradead.org/nvme into for-4.19/block2
Pull NVMe changes from Christoph: "This contains the support for TP4004, Asymmetric Namespace Access, which makes NVMe multipathing usable in practice." * 'nvme-4.19' of git://git.infradead.org/nvme: nvmet: use Retain Async Event bit to clear AEN nvmet: support configuring ANA groups nvmet: add minimal ANA support nvmet: track and limit the number of namespaces per subsystem nvmet: keep a port pointer in nvmet_ctrl nvme: add ANA support nvme: remove nvme_req_needs_failover nvme: simplify the API for getting log pages nvme.h: add ANA definitions nvme.h: add support for the log specific field Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--drivers/nvme/host/core.c69
-rw-r--r--drivers/nvme/host/lightnvm.c5
-rw-r--r--drivers/nvme/host/multipath.c349
-rw-r--r--drivers/nvme/host/nvme.h61
-rw-r--r--drivers/nvme/target/admin-cmd.c104
-rw-r--r--drivers/nvme/target/configfs.c190
-rw-r--r--drivers/nvme/target/core.c71
-rw-r--r--drivers/nvme/target/nvmet.h53
-rw-r--r--include/linux/nvme.h52
9 files changed, 880 insertions, 74 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9347f20190e5..603fe59756fb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req)
trace_nvme_complete_rq(req);
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if (nvme_req_needs_failover(req, status)) {
+ if ((req->cmd_flags & REQ_NVME_MPATH) &&
+ blk_path_error(status)) {
nvme_failover_req(req);
return;
}
@@ -1067,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -2281,21 +2282,16 @@ out_unlock:
return ret;
}
-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 log_page, void *log,
- size_t size, u64 offset)
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+ void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
unsigned long dwlen = size / 4 - 1;
c.get_log_page.opcode = nvme_admin_get_log_page;
-
- if (ns)
- c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
- else
- c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
-
+ c.get_log_page.nsid = cpu_to_le32(nsid);
c.get_log_page.lid = log_page;
+ c.get_log_page.lsp = lsp;
c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
@@ -2304,12 +2300,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
- size_t size)
-{
- return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
-}
-
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
{
int ret;
@@ -2320,8 +2310,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
if (!ctrl->effects)
return 0;
- ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
- sizeof(*ctrl->effects));
+ ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
+ ctrl->effects, sizeof(*ctrl->effects), 0);
if (ret) {
kfree(ctrl->effects);
ctrl->effects = NULL;
@@ -2412,6 +2402,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
+ ctrl->max_namespaces = le32_to_cpu(id->mnan);
if (id->rtd3e) {
/* us -> s */
@@ -2471,8 +2462,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
+ ret = nvme_mpath_init(ctrl, id);
kfree(id);
+ if (ret < 0)
+ return ret;
+
if (ctrl->apst_enabled && !prev_apst_enabled)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2691,6 +2686,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_ana_grpid.attr,
+ &dev_attr_ana_state.attr,
+#endif
NULL,
};
@@ -2713,6 +2712,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
return 0;
}
+#ifdef CONFIG_NVME_MULTIPATH
+ if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
+ if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
+ return 0;
+ if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
+ return 0;
+ }
+#endif
return a->mode;
}
@@ -3086,8 +3093,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_get_ctrl(ctrl);
- kfree(id);
-
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_id_attr_group))
@@ -3097,8 +3102,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
ns->disk->disk_name);
- nvme_mpath_add_disk(ns->head);
+ nvme_mpath_add_disk(ns, id);
nvme_fault_inject_init(ns);
+ kfree(id);
+
return;
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
@@ -3240,7 +3247,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
- error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
+ log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -3357,9 +3365,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
- dev_warn(ctrl->device,
- "Get FW SLOT INFO log error\n");
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log,
+ sizeof(*log), 0))
+ dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -3405,6 +3413,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
+#ifdef CONFIG_NVME_MULTIPATH
+ case NVME_AER_NOTICE_ANA:
+ if (!ctrl->ana_log_buf)
+ break;
+ queue_work(nvme_wq, &ctrl->ana_work);
+ break;
+#endif
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3437,6 +3452,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
+ nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
flush_work(&ctrl->scan_work);
@@ -3474,6 +3490,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
+ nvme_mpath_uninit(ctrl);
if (subsys) {
mutex_lock(&subsys->lock);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index d9e4cccd5b66..7e4cf4eb9d66 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -604,8 +604,9 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
while (left) {
len = min_t(unsigned int, left, max_len);
- ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
- dev_meta, len, offset);
+ ret = nvme_get_log(ctrl, ns->head->ns_id,
+ NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
+ offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1ffd3e8b13a1..c643872f8dac 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Christoph Hellwig.
+ * Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return multipath && (ctrl->subsys->cmic & (1 << 3));
+}
+
/*
* If multipathing is enabled we need to always use the subsystem instance
* number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
+ u16 status = nvme_req(req)->status;
unsigned long flags;
spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req)
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
- nvme_reset_ctrl(ns->ctrl);
- kblockd_schedule_work(&ns->head->requeue_work);
-}
+ switch (status & 0x7ff) {
+ case NVME_SC_ANA_TRANSITION:
+ case NVME_SC_ANA_INACCESSIBLE:
+ case NVME_SC_ANA_PERSISTENT_LOSS:
+ /*
+ * If we got back an ANA error we know the controller is alive,
+ * but not ready to serve this namespaces. The spec suggests
+ * we should update our general state here, but due to the fact
+ * that the admin and I/O queues are not serialized that is
+ * fundamentally racy. So instead just clear the current path,
+ * mark the the path as pending and kick of a re-read of the ANA
+ * log page ASAP.
+ */
+ nvme_mpath_clear_current_path(ns);
+ if (ns->ctrl->ana_log_buf) {
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
+ }
+ break;
+ default:
+ /*
+ * Reset the controller for any non-ANA error as we don't know
+ * what caused the error.
+ */
+ nvme_reset_ctrl(ns->ctrl);
+ break;
+ }
-bool nvme_req_needs_failover(struct request *req, blk_status_t error)
-{
- if (!(req->cmd_flags & REQ_NVME_MPATH))
- return false;
- return blk_path_error(error);
+ kblockd_schedule_work(&ns->head->requeue_work);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
up_read(&ctrl->namespaces_rwsem);
}
+static const char *nvme_ana_state_names[] = {
+ [0] = "invalid state",
+ [NVME_ANA_OPTIMIZED] = "optimized",
+ [NVME_ANA_NONOPTIMIZED] = "non-optimized",
+ [NVME_ANA_INACCESSIBLE] = "inaccessible",
+ [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
+ [NVME_ANA_CHANGE] = "change",
+};
+
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
- struct nvme_ns *ns;
+ struct nvme_ns *ns, *fallback = NULL;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ if (ns->ctrl->state != NVME_CTRL_LIVE ||
+ test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ continue;
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
rcu_assign_pointer(head->current_path, ns);
return ns;
+ case NVME_ANA_NONOPTIMIZED:
+ fallback = ns;
+ break;
+ default:
+ break;
}
}
- return NULL;
+ if (fallback)
+ rcu_assign_pointer(head->current_path, fallback);
+ return fallback;
+}
+
+static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
+{
+ return ns->ctrl->state == NVME_CTRL_LIVE &&
+ ns->ana_state == NVME_ANA_OPTIMIZED;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
- if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+ if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head);
return ns;
}
@@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path, &head->srcu);
- if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+ if (likely(ns && nvme_path_is_optimized(ns)))
found = ns->queue->poll_fn(q, qc);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
@@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
struct request_queue *q;
bool vwc = false;
+ mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -220,29 +273,232 @@ out:
return -ENOMEM;
}
-void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static void nvme_mpath_set_live(struct nvme_ns *ns)
{
+ struct nvme_ns_head *head = ns->head;
+
+ lockdep_assert_held(&ns->head->lock);
+
if (!head->disk)
return;
- mutex_lock(&head->subsys->lock);
if (!(head->disk->flags & GENHD_FL_UP)) {
device_add_disk(&head->subsys->dev, head->disk);
if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group))
- pr_warn("%s: failed to create sysfs group for identification\n",
- head->disk->disk_name);
+ dev_warn(&head->subsys->dev,
+ "failed to create id group.\n");
+ }
+
+ kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
+ int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
+ void *))
+{
+ void *base = ctrl->ana_log_buf;
+ size_t offset = sizeof(struct nvme_ana_rsp_hdr);
+ int error, i;
+
+ lockdep_assert_held(&ctrl->ana_lock);
+
+ for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
+ struct nvme_ana_group_desc *desc = base + offset;
+ u32 nr_nsids = le32_to_cpu(desc->nnsids);
+ size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+
+ if (WARN_ON_ONCE(desc->grpid == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state == 0))
+ return -EINVAL;
+ if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
+ return -EINVAL;
+
+ offset += sizeof(*desc);
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
+ return -EINVAL;
+
+ error = cb(ctrl, desc, data);
+ if (error)
+ return error;
+
+ offset += nsid_buf_size;
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline bool nvme_state_is_live(enum nvme_ana_state state)
+{
+ return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
+}
+
+static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
+ struct nvme_ns *ns)
+{
+ enum nvme_ana_state old;
+
+ mutex_lock(&ns->head->lock);
+ old = ns->ana_state;
+ ns->ana_grpid = le32_to_cpu(desc->grpid);
+ ns->ana_state = desc->state;
+ clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+
+ if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
+}
+
+static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
+ unsigned *nr_change_groups = data;
+ struct nvme_ns *ns;
+
+ dev_info(ctrl->device, "ANA group %d: %s.\n",
+ le32_to_cpu(desc->grpid),
+ nvme_ana_state_names[desc->state]);
+
+ if (desc->state == NVME_ANA_CHANGE)
+ (*nr_change_groups)++;
+
+ if (!nr_nsids)
+ return 0;
+
+ down_write(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ continue;
+ nvme_update_ns_ana_state(desc, ns);
+ if (++n == nr_nsids)
+ break;
+ }
+ up_write(&ctrl->namespaces_rwsem);
+ WARN_ON_ONCE(n < nr_nsids);
+ return 0;
+}
+
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+{
+ u32 nr_change_groups = 0;
+ int error;
+
+ mutex_lock(&ctrl->ana_lock);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
+ groups_only ? NVME_ANA_LOG_RGO : 0,
+ ctrl->ana_log_buf, ctrl->ana_log_size, 0);
+ if (error) {
+ dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
+ goto out_unlock;
+ }
+
+ error = nvme_parse_ana_log(ctrl, &nr_change_groups,
+ nvme_update_ana_state);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * In theory we should have an ANATT timer per group as they might enter
+ * the change state at different times. But that is a lot of overhead
+ * just to protect against a target that keeps entering new changes
+ * states while never finishing previous ones. But we'll still
+ * eventually time out once all groups are in change state, so this
+ * isn't a big deal.
+ *
+ * We also double the ANATT value to provide some slack for transports
+ * or AEN processing overhead.
+ */
+ if (nr_change_groups)
+ mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
+ else
+ del_timer_sync(&ctrl->anatt_timer);
+out_unlock:
+ mutex_unlock(&ctrl->ana_lock);
+ return error;
+}
+
+static void nvme_ana_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+
+ nvme_read_ana_log(ctrl, false);
+}
+
+static void nvme_anatt_timeout(struct timer_list *t)
+{
+ struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
+
+ dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
+ nvme_reset_ctrl(ctrl);
+}
+
+void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_ctrl_use_ana(ctrl))
+ return;
+ del_timer_sync(&ctrl->anatt_timer);
+ cancel_work_sync(&ctrl->ana_work);
+}
+
+static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+}
+DEVICE_ATTR_RO(ana_grpid);
+
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+}
+DEVICE_ATTR_RO(ana_state);
+
+static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+ struct nvme_ana_group_desc *desc, void *data)
+{
+ struct nvme_ns *ns = data;
+
+ if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
+ nvme_update_ns_ana_state(desc, ns);
+ return -ENXIO; /* just break out of the loop */
+ }
+
+ return 0;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ if (nvme_ctrl_use_ana(ns->ctrl)) {
+ mutex_lock(&ns->ctrl->ana_lock);
+ ns->ana_grpid = le32_to_cpu(id->anagrpid);
+ nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+ mutex_unlock(&ns->ctrl->ana_lock);
+ } else {
+ mutex_lock(&ns->head->lock);
+ ns->ana_state = NVME_ANA_OPTIMIZED;
+ nvme_mpath_set_live(ns);
+ mutex_unlock(&ns->head->lock);
}
- mutex_unlock(&head->subsys->lock);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
- sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
- &nvme_ns_id_attr_group);
- del_gendisk(head->disk);
+ if (head->disk->flags & GENHD_FL_UP) {
+ sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+ &nvme_ns_id_attr_group);
+ del_gendisk(head->disk);
+ }
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
@@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
+
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ int error;
+
+ if (!nvme_ctrl_use_ana(ctrl))
+ return 0;
+
+ ctrl->anacap = id->anacap;
+ ctrl->anatt = id->anatt;
+ ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
+ ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
+
+ mutex_init(&ctrl->ana_lock);
+ timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+ ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+ ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
+ if (!(ctrl->anacap & (1 << 6)))
+ ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+
+ if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+ dev_err(ctrl->device,
+ "ANA log page size (%zd) larger than MDTS (%d).\n",
+ ctrl->ana_log_size,
+ ctrl->max_hw_sectors << SECTOR_SHIFT);
+ dev_err(ctrl->device, "disabling ANA support.\n");
+ return 0;
+ }
+
+ INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+ ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
+ if (!ctrl->ana_log_buf)
+ goto out;
+
+ error = nvme_read_ana_log(ctrl, true);
+ if (error)
+ goto out_free_ana_log_buf;
+ return 0;
+out_free_ana_log_buf:
+ kfree(ctrl->ana_log_buf);
+out:
+ return -ENOMEM;
+}
+
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+ kfree(ctrl->ana_log_buf);
+}
+
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cf970f9543a6..bb4a2003c097 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -183,6 +183,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
+ u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
u32 vs;
@@ -205,6 +206,19 @@ struct nvme_ctrl {
struct work_struct fw_act_work;
unsigned long events;
+#ifdef CONFIG_NVME_MULTIPATH
+ /* asymmetric namespace access: */
+ u8 anacap;
+ u8 anatt;
+ u32 anagrpmax;
+ u32 nanagrpid;
+ struct mutex ana_lock;
+ struct nvme_ana_rsp_hdr *ana_log_buf;
+ size_t ana_log_size;
+ struct timer_list anatt_timer;
+ struct work_struct ana_work;
+#endif
+
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@@ -269,6 +283,7 @@ struct nvme_ns_head {
struct bio_list requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
+ struct mutex lock;
#endif
struct list_head list;
struct srcu_struct srcu;
@@ -295,6 +310,10 @@ struct nvme_ns {
struct nvme_ctrl *ctrl;
struct request_queue *queue;
struct gendisk *disk;
+#ifdef CONFIG_NVME_MULTIPATH
+ enum nvme_ana_state ana_state;
+ u32 ana_grpid;
+#endif
struct list_head siblings;
struct nvm_dev *ndev;
struct kref kref;
@@ -307,8 +326,9 @@ struct nvme_ns {
bool ext;
u8 pi_type;
unsigned long flags;
-#define NVME_NS_REMOVING 0
-#define NVME_NS_DEAD 1
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD 1
+#define NVME_NS_ANA_PENDING 2
u16 noiob;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -436,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 log_page, void *log, size_t size, u64 offset);
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+ void *log, size_t size, u64 offset);
extern const struct attribute_group nvme_ns_id_attr_group;
extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
+bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
void nvme_failover_req(struct request *req);
-bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
+void nvme_mpath_stop(struct nvme_ctrl *ctrl);
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -469,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+extern struct device_attribute dev_attr_ana_grpid;
+extern struct device_attribute dev_attr_ana_state;
+
#else
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return false;
+}
/*
* Without the multipath code enabled, multiple controller per subsystems are
* visible as devices and thus we cannot use the subsystem instance.
@@ -483,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
static inline void nvme_failover_req(struct request *req)
{
}
-static inline bool nvme_req_needs_failover(struct request *req,
- blk_status_t error)
-{
- return false;
-}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
}
@@ -496,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
return 0;
}
-static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
+ struct nvme_id_ns *id)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -508,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
+ struct nvme_id_ctrl *id)
+{
+ return 0;
+}
+static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 16a9b24270f9..f517bc562d26 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,6 +19,19 @@
#include <asm/unaligned.h>
#include "nvmet.h"
+/*
+ * This helper allows us to clear the AEN based on the RAE bit,
+ * Please use this helper when processing the log pages which are
+ * associated with the AEN.
+ */
+static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
+{
+ int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
+
+ if (!rae)
+ clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
+}
+
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
{
u32 len = le16_to_cpu(cmd->get_log_page.numdu);
@@ -176,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
if (!status)
status = nvmet_zero_sgl(req, len, req->data_len - len);
ctrl->nr_changed_ns = 0;
- clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked);
+ nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
mutex_unlock(&ctrl->lock);
out:
nvmet_req_complete(req, status);
}
+static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
+ struct nvme_ana_group_desc *desc)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_ns *ns;
+ u32 count = 0;
+
+ if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+ if (ns->anagrpid == grpid)
+ desc->nsids[count++] = cpu_to_le32(ns->nsid);
+ rcu_read_unlock();
+ }
+
+ desc->grpid = cpu_to_le32(grpid);
+ desc->nnsids = cpu_to_le32(count);
+ desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
+ desc->state = req->port->ana_state[grpid];
+ memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
+ return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
+}
+
+static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
+{
+ struct nvme_ana_rsp_hdr hdr = { 0, };
+ struct nvme_ana_group_desc *desc;
+ size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */
+ size_t len;
+ u32 grpid;
+ u16 ngrps = 0;
+ u16 status;
+
+ status = NVME_SC_INTERNAL;
+ desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
+ NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
+ if (!desc)
+ goto out;
+
+ down_read(&nvmet_ana_sem);
+ for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) {
+ if (!nvmet_ana_group_enabled[grpid])
+ continue;
+ len = nvmet_format_ana_group(req, grpid, desc);
+ status = nvmet_copy_to_sgl(req, offset, desc, len);
+ if (status)
+ break;
+ offset += len;
+ ngrps++;
+ }
+
+ hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
+ hdr.ngrps = cpu_to_le16(ngrps);
+ nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
+ up_read(&nvmet_ana_sem);
+
+ kfree(desc);
+
+ /* copy the header last once we know the number of groups */
+ status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr));
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -213,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
* the safest is to leave it as zeroes.
*/
- /* we support multiple ports and multiples hosts: */
- id->cmic = (1 << 0) | (1 << 1);
+ /* we support multiple ports, multiples hosts and ANA: */
+ id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
/* no limit on data transfer sizes for now */
id->mdts = 0;
@@ -252,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
+ id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
NVME_CTRL_ONCS_WRITE_ZEROES);
@@ -281,6 +359,11 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->msdbd = ctrl->ops->msdbd;
+ id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
+ id->anatt = 10; /* random value */
+ id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
+ id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS);
+
/*
* Meh, we don't really support any power state. Fake up the same
* values that qemu does.
@@ -322,8 +405,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
* nuse = ncap = nsze isn't always true, but we have no way to find
* that out from the underlying device.
*/
- id->ncap = id->nuse = id->nsze =
- cpu_to_le64(ns->size >> ns->blksize_shift);
+ id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift);
+ switch (req->port->ana_state[ns->anagrpid]) {
+ case NVME_ANA_INACCESSIBLE:
+ case NVME_ANA_PERSISTENT_LOSS:
+ break;
+ default:
+ id->nuse = id->nsze;
+ break;
+ }
/*
* We just provide a single LBA format that matches what the
@@ -337,6 +427,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
* controllers, but also with any other user of the block device.
*/
id->nmic = (1 << 0);
+ id->anagrpid = cpu_to_le32(ns->anagrpid);
memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
@@ -619,6 +710,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
case NVME_LOG_CMD_EFFECTS:
req->execute = nvmet_execute_get_log_cmd_effects_ns;
return 0;
+ case NVME_LOG_ANA:
+ req->execute = nvmet_execute_get_log_page_ana;
+ return 0;
}
break;
case nvme_admin_identify:
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 3ba5ea5c4376..51f5a8c092b4 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -411,6 +411,39 @@ out_unlock:
CONFIGFS_ATTR(nvmet_ns_, device_nguid);
+static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid);
+}
+
+static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ns *ns = to_nvmet_ns(item);
+ u32 oldgrpid, newgrpid;
+ int ret;
+
+ ret = kstrtou32(page, 0, &newgrpid);
+ if (ret)
+ return ret;
+
+ if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS)
+ return -EINVAL;
+
+ down_write(&nvmet_ana_sem);
+ oldgrpid = ns->anagrpid;
+ nvmet_ana_group_enabled[newgrpid]++;
+ ns->anagrpid = newgrpid;
+ nvmet_ana_group_enabled[oldgrpid]--;
+ nvmet_ana_chgcnt++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_send_ana_event(ns->subsys, NULL);
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
+
static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
{
return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@ -468,6 +501,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_device_path,
&nvmet_ns_attr_device_nguid,
&nvmet_ns_attr_device_uuid,
+ &nvmet_ns_attr_ana_grpid,
&nvmet_ns_attr_enable,
&nvmet_ns_attr_buffered_io,
NULL,
@@ -916,6 +950,134 @@ static const struct config_item_type nvmet_referrals_type = {
.ct_group_ops = &nvmet_referral_group_ops,
};
+static struct {
+ enum nvme_ana_state state;
+ const char *name;
+} nvmet_ana_state_names[] = {
+ { NVME_ANA_OPTIMIZED, "optimized" },
+ { NVME_ANA_NONOPTIMIZED, "non-optimized" },
+ { NVME_ANA_INACCESSIBLE, "inaccessible" },
+ { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" },
+ { NVME_ANA_CHANGE, "change" },
+};
+
+static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+ enum nvme_ana_state state = grp->port->ana_state[grp->grpid];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+ if (state != nvmet_ana_state_names[i].state)
+ continue;
+ return sprintf(page, "%s\n", nvmet_ana_state_names[i].name);
+ }
+
+ return sprintf(page, "\n");
+}
+
+static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+ if (sysfs_streq(page, nvmet_ana_state_names[i].name))
+ goto found;
+ }
+
+ pr_err("Invalid value '%s' for ana_state\n", page);
+ return -EINVAL;
+
+found:
+ down_write(&nvmet_ana_sem);
+ grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state;
+ nvmet_ana_chgcnt++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+
+static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+ &nvmet_ana_group_attr_ana_state,
+ NULL,
+};
+
+static void nvmet_ana_group_release(struct config_item *item)
+{
+ struct nvmet_ana_group *grp = to_ana_group(item);
+
+ if (grp == &grp->port->ana_default_group)
+ return;
+
+ down_write(&nvmet_ana_sem);
+ grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+ nvmet_ana_group_enabled[grp->grpid]--;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+ kfree(grp);
+}
+
+static struct configfs_item_operations nvmet_ana_group_item_ops = {
+ .release = nvmet_ana_group_release,
+};
+
+static const struct config_item_type nvmet_ana_group_type = {
+ .ct_item_ops = &nvmet_ana_group_item_ops,
+ .ct_attrs = nvmet_ana_group_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *nvmet_ana_groups_make_group(
+ struct config_group *group, const char *name)
+{
+ struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+ struct nvmet_ana_group *grp;
+ u32 grpid;
+ int ret;
+
+ ret = kstrtou32(name, 0, &grpid);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+ goto out;
+
+ ret = -ENOMEM;
+ grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ if (!grp)
+ goto out;
+ grp->port = port;
+ grp->grpid = grpid;
+
+ down_write(&nvmet_ana_sem);
+ nvmet_ana_group_enabled[grpid]++;
+ up_write(&nvmet_ana_sem);
+
+ nvmet_port_send_ana_event(grp->port);
+
+ config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+ return &grp->group;
+out:
+ return ERR_PTR(ret);
+}
+
+static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+ .make_group = nvmet_ana_groups_make_group,
+};
+
+static const struct config_item_type nvmet_ana_groups_type = {
+ .ct_group_ops = &nvmet_ana_groups_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
/*
* Ports definitions.
*/
@@ -923,6 +1085,7 @@ static void nvmet_port_release(struct config_item *item)
{
struct nvmet_port *port = to_nvmet_port(item);
+ kfree(port->ana_state);
kfree(port);
}
@@ -951,6 +1114,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
{
struct nvmet_port *port;
u16 portid;
+ u32 i;
if (kstrtou16(name, 0, &portid))
return ERR_PTR(-EINVAL);
@@ -959,6 +1123,20 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
if (!port)
return ERR_PTR(-ENOMEM);
+ port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+ sizeof(*port->ana_state), GFP_KERNEL);
+ if (!port->ana_state) {
+ kfree(port);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+ if (i == NVMET_DEFAULT_ANA_GRPID)
+ port->ana_state[1] = NVME_ANA_OPTIMIZED;
+ else
+ port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+ }
+
INIT_LIST_HEAD(&port->entry);
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
@@ -975,6 +1153,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
"referrals", &nvmet_referrals_type);
configfs_add_default_group(&port->referrals_group, &port->group);
+ config_group_init_type_name(&port->ana_groups_group,
+ "ana_groups", &nvmet_ana_groups_type);
+ configfs_add_default_group(&port->ana_groups_group, &port->group);
+
+ port->ana_default_group.port = port;
+ port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+ config_group_init_type_name(&port->ana_default_group.group,
+ __stringify(NVMET_DEFAULT_ANA_GRPID),
+ &nvmet_ana_group_type);
+ configfs_add_default_group(&port->ana_default_group.group,
+ &port->ana_groups_group);
+
return &port->group;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index ddd85715a00a..3ceb7a03bb2a 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -40,6 +40,10 @@ static DEFINE_IDA(cntlid_ida);
*/
DECLARE_RWSEM(nvmet_config_sem);
+u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+u64 nvmet_ana_chgcnt;
+DECLARE_RWSEM(nvmet_ana_sem);
+
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn);
@@ -190,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
}
}
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+ struct nvmet_port *port)
+{
+ struct nvmet_ctrl *ctrl;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (port && ctrl->port != port)
+ continue;
+ if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+ continue;
+ nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+ NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+ }
+ mutex_unlock(&subsys->lock);
+}
+
+void nvmet_port_send_ana_event(struct nvmet_port *port)
+{
+ struct nvmet_subsys_link *p;
+
+ down_read(&nvmet_config_sem);
+ list_for_each_entry(p, &port->subsystems, entry)
+ nvmet_send_ana_event(p->subsys, port);
+ up_read(&nvmet_config_sem);
+}
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
{
int ret = 0;
@@ -337,9 +368,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
int nvmet_ns_enable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
- int ret = 0;
+ int ret;
mutex_lock(&subsys->lock);
+ ret = -EMFILE;
+ if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+ goto out_unlock;
+ ret = 0;
if (ns->enabled)
goto out_unlock;
@@ -374,6 +409,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
+ subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
ns->enabled = true;
@@ -414,6 +450,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
percpu_ref_exit(&ns->ref);
mutex_lock(&subsys->lock);
+ subsys->nr_namespaces--;
nvmet_ns_changed(subsys, ns->nsid);
nvmet_ns_dev_disable(ns);
out_unlock:
@@ -424,6 +461,10 @@ void nvmet_ns_free(struct nvmet_ns *ns)
{
nvmet_ns_disable(ns);
+ down_write(&nvmet_ana_sem);
+ nvmet_ana_group_enabled[ns->anagrpid]--;
+ up_write(&nvmet_ana_sem);
+
kfree(ns->device_path);
kfree(ns);
}
@@ -441,6 +482,12 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
ns->nsid = nsid;
ns->subsys = subsys;
+
+ down_write(&nvmet_ana_sem);
+ ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+ nvmet_ana_group_enabled[ns->anagrpid]++;
+ up_write(&nvmet_ana_sem);
+
uuid_gen(&ns->uuid);
ns->buffered_io = false;
@@ -548,6 +595,20 @@ int nvmet_sq_init(struct nvmet_sq *sq)
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);
+static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+ struct nvmet_ns *ns)
+{
+ enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+
+ if (unlikely(state == NVME_ANA_INACCESSIBLE))
+ return NVME_SC_ANA_INACCESSIBLE;
+ if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
+ return NVME_SC_ANA_PERSISTENT_LOSS;
+ if (unlikely(state == NVME_ANA_CHANGE))
+ return NVME_SC_ANA_TRANSITION;
+ return 0;
+}
+
static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -560,6 +621,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
+ ret = nvmet_check_ana_state(req->port, req->ns);
+ if (unlikely(ret))
+ return ret;
if (req->ns->file)
return nvmet_file_parse_io_cmd(req);
@@ -876,6 +940,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
nvmet_init_cap(ctrl);
+ ctrl->port = req->port;
+
INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
INIT_LIST_HEAD(&ctrl->async_events);
@@ -1115,12 +1181,15 @@ static int __init nvmet_init(void)
{
int error;
+ nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+
buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
WQ_MEM_RECLAIM, 0);
if (!buffered_io_wq) {
error = -ENOMEM;
goto out;
}
+
error = nvmet_init_discovery();
if (error)
goto out;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 688993855402..22941045f46e 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,12 +30,11 @@
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
-
/*
* Supported optional AENs:
*/
#define NVMET_AEN_CFG_OPTIONAL \
- NVME_AEN_CFG_NS_ATTR
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
/*
* Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
@@ -64,6 +63,7 @@ struct nvmet_ns {
loff_t size;
u8 nguid[16];
uuid_t uuid;
+ u32 anagrpid;
bool buffered_io;
bool enabled;
@@ -98,6 +98,18 @@ struct nvmet_sq {
struct completion confirm_done;
};
+struct nvmet_ana_group {
+ struct config_group group;
+ struct nvmet_port *port;
+ u32 grpid;
+};
+
+static inline struct nvmet_ana_group *to_ana_group(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct nvmet_ana_group,
+ group);
+}
+
/**
* struct nvmet_port - Common structure to keep port
* information for the target.
@@ -115,6 +127,9 @@ struct nvmet_port {
struct list_head subsystems;
struct config_group referrals_group;
struct list_head referrals;
+ struct config_group ana_groups_group;
+ struct nvmet_ana_group ana_default_group;
+ enum nvme_ana_state *ana_state;
void *priv;
bool enabled;
int inline_data_size;
@@ -126,6 +141,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
group);
}
+static inline struct nvmet_port *ana_groups_to_port(
+ struct config_item *item)
+{
+ return container_of(to_config_group(item), struct nvmet_port,
+ ana_groups_group);
+}
+
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_cq **cqs;
@@ -140,6 +162,8 @@ struct nvmet_ctrl {
u16 cntlid;
u32 kato;
+ struct nvmet_port *port;
+
u32 aen_enabled;
unsigned long aen_masked;
struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS];
@@ -168,6 +192,7 @@ struct nvmet_subsys {
struct kref ref;
struct list_head namespaces;
+ unsigned int nr_namespaces;
unsigned int max_nsid;
struct list_head ctrls;
@@ -340,6 +365,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns);
struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid);
void nvmet_ns_free(struct nvmet_ns *ns);
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+ struct nvmet_port *port);
+void nvmet_port_send_ana_event(struct nvmet_port *port);
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops);
void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops);
@@ -360,6 +389,22 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
#define NVMET_QUEUE_SIZE 1024
#define NVMET_NR_QUEUES 128
#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
+
+/*
+ * Nice round number that makes a list of nsids fit into a page.
+ * Should become tunable at some point in the future.
+ */
+#define NVMET_MAX_NAMESPACES 1024
+
+/*
+ * 0 is not a valid ANA group ID, so we start numbering at 1.
+ *
+ * ANA Group 1 exists without manual intervention, has namespaces assigned to it
+ * by default, and is available in an optimized state through all ports.
+ */
+#define NVMET_MAX_ANAGRPS 128
+#define NVMET_DEFAULT_ANA_GRPID 1
+
#define NVMET_KAS 10
#define NVMET_DISC_KATO 120
@@ -373,6 +418,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys;
extern u64 nvmet_genctr;
extern struct rw_semaphore nvmet_config_sem;
+extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+extern u64 nvmet_ana_chgcnt;
+extern struct rw_semaphore nvmet_ana_sem;
+
bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
const char *hostnqn);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 80dfedcf0bf7..64c9175723de 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,7 +242,12 @@ struct nvme_id_ctrl {
__le32 sanicap;
__le32 hmminds;
__le16 hmmaxd;
- __u8 rsvd338[174];
+ __u8 rsvd338[4];
+ __u8 anatt;
+ __u8 anacap;
+ __le32 anagrpmax;
+ __le32 nanagrpid;
+ __u8 rsvd352[160];
__u8 sqes;
__u8 cqes;
__le16 maxcmd;
@@ -258,7 +263,8 @@ struct nvme_id_ctrl {
__le16 acwu;
__u8 rsvd534[2];
__le32 sgls;
- __u8 rsvd540[228];
+ __le32 mnan;
+ __u8 rsvd544[224];
char subnqn[256];
__u8 rsvd1024[768];
__le32 ioccsz;
@@ -312,7 +318,9 @@ struct nvme_id_ns {
__le16 nabspf;
__le16 noiob;
__u8 nvmcap[16];
- __u8 rsvd64[40];
+ __u8 rsvd64[28];
+ __le32 anagrpid;
+ __u8 rsvd96[8];
__u8 nguid[16];
__u8 eui64[8];
struct nvme_lbaf lbaf[16];
@@ -425,6 +433,32 @@ struct nvme_effects_log {
__u8 resv[2048];
};
+enum nvme_ana_state {
+ NVME_ANA_OPTIMIZED = 0x01,
+ NVME_ANA_NONOPTIMIZED = 0x02,
+ NVME_ANA_INACCESSIBLE = 0x03,
+ NVME_ANA_PERSISTENT_LOSS = 0x04,
+ NVME_ANA_CHANGE = 0x0f,
+};
+
+struct nvme_ana_group_desc {
+ __le32 grpid;
+ __le32 nnsids;
+ __le64 chgcnt;
+ __u8 state;
+ __u8 rsvd17[7];
+ __le32 nsids[];
+};
+
+/* flag for the log specific field of the ANA log */
+#define NVME_ANA_LOG_RGO (1 << 0)
+
+struct nvme_ana_rsp_hdr {
+ __le64 chgcnt;
+ __le16 ngrps;
+ __le16 rsvd10[3];
+};
+
enum {
NVME_SMART_CRIT_SPARE = 1 << 0,
NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
@@ -444,11 +478,13 @@ enum {
enum {
NVME_AER_NOTICE_NS_CHANGED = 0x00,
NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+ NVME_AER_NOTICE_ANA = 0x03,
};
enum {
NVME_AEN_CFG_NS_ATTR = 1 << 8,
NVME_AEN_CFG_FW_ACT = 1 << 9,
+ NVME_AEN_CFG_ANA_CHANGE = 1 << 11,
};
struct nvme_lba_range_type {
@@ -763,6 +799,7 @@ enum {
NVME_LOG_FW_SLOT = 0x03,
NVME_LOG_CHANGED_NS = 0x04,
NVME_LOG_CMD_EFFECTS = 0x05,
+ NVME_LOG_ANA = 0x0c,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -885,7 +922,7 @@ struct nvme_get_log_page_command {
__u64 rsvd2[2];
union nvme_data_ptr dptr;
__u8 lid;
- __u8 rsvd10;
+ __u8 lsp; /* upper 4 bits reserved */
__le16 numdl;
__le16 numdu;
__u16 rsvd11;
@@ -1185,6 +1222,13 @@ enum {
NVME_SC_ACCESS_DENIED = 0x286,
NVME_SC_UNWRITTEN_BLOCK = 0x287,
+ /*
+ * Path-related Errors:
+ */
+ NVME_SC_ANA_PERSISTENT_LOSS = 0x301,
+ NVME_SC_ANA_INACCESSIBLE = 0x302,
+ NVME_SC_ANA_TRANSITION = 0x303,
+
NVME_SC_DNR = 0x4000,
};