summaryrefslogtreecommitdiff
path: root/drivers/nvme/host
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig2
-rw-r--r--drivers/nvme/host/core.c293
-rw-r--r--drivers/nvme/host/fabrics.c8
-rw-r--r--drivers/nvme/host/fc.c17
-rw-r--r--drivers/nvme/host/multipath.c28
-rw-r--r--drivers/nvme/host/nvme.h6
-rw-r--r--drivers/nvme/host/pci.c103
-rw-r--r--drivers/nvme/host/rdma.c19
-rw-r--r--drivers/nvme/host/tcp.c136
9 files changed, 361 insertions, 251 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index b9358db83e96..9c17ed32be64 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -32,8 +32,6 @@ config NVME_HWMON
a hardware monitoring device will be created for each NVMe drive
in the system.
- If unsure, say N.
-
config NVME_FABRICS
tristate
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a4d8c90ee7cc..f3c037f5a9ba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
@@ -171,7 +172,6 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
nvme_remove_namespaces(ctrl);
ctrl->ops->delete_ctrl(ctrl);
nvme_uninit_ctrl(ctrl);
- nvme_put_ctrl(ctrl);
}
static void nvme_delete_ctrl_work(struct work_struct *work)
@@ -192,21 +192,16 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
-static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
- int ret = 0;
-
/*
* Keep a reference until nvme_do_delete_ctrl() complete,
* since ->delete_ctrl can free the controller.
*/
nvme_get_ctrl(ctrl);
- if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
- ret = -EBUSY;
- if (!ret)
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
nvme_do_delete_ctrl(ctrl);
nvme_put_ctrl(ctrl);
- return ret;
}
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
@@ -291,11 +286,8 @@ void nvme_complete_rq(struct request *req)
nvme_req(req)->ctrl->comp_seen = true;
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if ((req->cmd_flags & REQ_NVME_MPATH) &&
- blk_path_error(status)) {
- nvme_failover_req(req);
+ if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
return;
- }
if (!blk_queue_dying(req->q)) {
nvme_retry_req(req);
@@ -1055,6 +1047,43 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
return error;
}
+static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
+ struct nvme_ns_id_desc *cur)
+{
+ const char *warn_str = "ctrl returned bogus length:";
+ void *data = cur;
+
+ switch (cur->nidt) {
+ case NVME_NIDT_EUI64:
+ if (cur->nidl != NVME_NIDT_EUI64_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
+ return NVME_NIDT_EUI64_LEN;
+ case NVME_NIDT_NGUID:
+ if (cur->nidl != NVME_NIDT_NGUID_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
+ return NVME_NIDT_NGUID_LEN;
+ case NVME_NIDT_UUID:
+ if (cur->nidl != NVME_NIDT_UUID_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ uuid_copy(&ids->uuid, data + sizeof(*cur));
+ return NVME_NIDT_UUID_LEN;
+ default:
+ /* Skip unknown types */
+ return cur->nidl;
+ }
+}
+
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
{
@@ -1074,8 +1103,17 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
NVME_IDENTIFY_DATA_SIZE);
- if (status)
+ if (status) {
+ dev_warn(ctrl->device,
+ "Identify Descriptors failed (%d)\n", status);
+ /*
+ * Don't treat an error as fatal, as we potentially already
+ * have a NGUID or EUI-64.
+ */
+ if (status > 0 && !(status & NVME_SC_DNR))
+ status = 0;
goto free_data;
+ }
for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
struct nvme_ns_id_desc *cur = data + pos;
@@ -1083,42 +1121,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
- switch (cur->nidt) {
- case NVME_NIDT_EUI64:
- if (cur->nidl != NVME_NIDT_EUI64_LEN) {
- dev_warn(ctrl->device,
- "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
- cur->nidl);
- goto free_data;
- }
- len = NVME_NIDT_EUI64_LEN;
- memcpy(ids->eui64, data + pos + sizeof(*cur), len);
- break;
- case NVME_NIDT_NGUID:
- if (cur->nidl != NVME_NIDT_NGUID_LEN) {
- dev_warn(ctrl->device,
- "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
- cur->nidl);
- goto free_data;
- }
- len = NVME_NIDT_NGUID_LEN;
- memcpy(ids->nguid, data + pos + sizeof(*cur), len);
- break;
- case NVME_NIDT_UUID:
- if (cur->nidl != NVME_NIDT_UUID_LEN) {
- dev_warn(ctrl->device,
- "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
- cur->nidl);
- goto free_data;
- }
- len = NVME_NIDT_UUID_LEN;
- uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
- break;
- default:
- /* Skip unknown types */
- len = cur->nidl;
- break;
- }
+ len = nvme_process_ns_desc(ctrl, ids, cur);
+ if (len < 0)
+ goto free_data;
len += sizeof(*cur);
}
@@ -1248,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
queue_work(nvme_wq, &ctrl->async_event_work);
}
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+ if (in_compat_syscall())
+ ptrval = (compat_uptr_t)ptrval;
+ return (void __user *)ptrval;
+}
+
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@@ -1271,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(uintptr_t)io.metadata;
+ metadata = nvme_to_user_ptr(io.metadata);
if (ns->ext) {
length += meta_len;
@@ -1294,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c,
- (void __user *)(uintptr_t)io.addr, length,
+ nvme_to_user_ptr(io.addr), length,
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
@@ -1414,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata,
- cmd.metadata_len, 0, &result, timeout);
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+ 0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
@@ -1461,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
nvme_passthru_end(ctrl, effects);
@@ -1584,6 +1601,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
}
+#ifdef CONFIG_COMPAT
+struct nvme_user_io32 {
+ __u8 opcode;
+ __u8 flags;
+ __u16 control;
+ __u16 nblocks;
+ __u16 rsvd;
+ __u64 metadata;
+ __u64 addr;
+ __u64 slba;
+ __u32 dsmgmt;
+ __u32 reftag;
+ __u16 apptag;
+ __u16 appmask;
+} __attribute__((__packed__));
+
+#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
+
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ /*
+ * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
+ * between 32 bit programs and 64 bit kernel.
+ * The cause is that the results of sizeof(struct nvme_user_io),
+ * which is used to define NVME_IOCTL_SUBMIT_IO,
+ * are not same between 32 bit compiler and 64 bit compiler.
+ * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
+ * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
+ * Other IOCTL numbers are same between 32 bit and 64 bit.
+ * So there is nothing to do regarding to other IOCTL numbers.
+ */
+ if (cmd == NVME_IOCTL_SUBMIT_IO32)
+ return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
+
+ return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl NULL
+#endif /* CONFIG_COMPAT */
+
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
@@ -1721,26 +1779,15 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
- int ret = 0;
-
memset(ids, 0, sizeof(*ids));
if (ctrl->vs >= NVME_VS(1, 1, 0))
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0)) {
- /* Don't treat error as fatal we potentially
- * already have a NGUID or EUI-64
- */
- ret = nvme_identify_ns_descs(ctrl, nsid, ids);
- if (ret)
- dev_warn(ctrl->device,
- "Identify Descriptors failed (%d)\n", ret);
- if (ret > 0)
- ret = 0;
- }
- return ret;
+ if (ctrl->vs >= NVME_VS(1, 3, 0))
+ return nvme_identify_ns_descs(ctrl, nsid, ids);
+ return 0;
}
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -1810,7 +1857,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
ns->lba_shift > PAGE_SHIFT)
capacity = 0;
- set_capacity(disk, capacity);
+ set_capacity_revalidate_and_notify(disk, capacity, false);
nvme_config_discard(disk, ns);
nvme_config_write_zeroes(disk, ns);
@@ -1850,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+ struct backing_dev_info *info =
+ ns->head->disk->queue->backing_dev_info;
+
+ info->capabilities |= BDI_CAP_STABLE_WRITES;
+ }
+
revalidate_disk(ns->head->disk);
}
#endif
@@ -2027,7 +2081,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
- .compat_ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_compat_ioctl,
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
@@ -2055,7 +2109,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.open = nvme_ns_head_open,
.release = nvme_ns_head_release,
.ioctl = nvme_ioctl,
- .compat_ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_compat_ioctl,
.getgeo = nvme_getgeo,
.pr_ops = &nvme_pr_ops,
};
@@ -2074,13 +2128,13 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
if ((csts & NVME_CSTS_RDY) == bit)
break;
- msleep(100);
+ usleep_range(1000, 2000);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->device,
- "Device not ready; aborting %s\n", enabled ?
- "initialisation" : "reset");
+ "Device not ready; aborting %s, CSTS=0x%x\n",
+ enabled ? "initialisation" : "reset", csts);
return -ENODEV;
}
}
@@ -2591,8 +2645,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
lockdep_assert_held(&nvme_subsystems_lock);
list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
- if (tmp->state == NVME_CTRL_DELETING ||
- tmp->state == NVME_CTRL_DEAD)
+ if (nvme_state_terminal(tmp))
continue;
if (tmp->cntlid == ctrl->cntlid) {
@@ -3193,6 +3246,10 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ /* Can't delete non-created controllers */
+ if (!ctrl->created)
+ return -EBUSY;
+
if (device_remove_file_self(dev, attr))
nvme_delete_ctrl_sync(ctrl);
return count;
@@ -3242,6 +3299,26 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
}
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
+static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
+}
+static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
+
+static ssize_t nvme_sysfs_show_hostid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
+}
+static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
+
static ssize_t nvme_sysfs_show_address(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -3267,6 +3344,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_numa_node.attr,
&dev_attr_queue_count.attr,
&dev_attr_sqsize.attr,
+ &dev_attr_hostnqn.attr,
+ &dev_attr_hostid.attr,
NULL
};
@@ -3280,6 +3359,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
return 0;
+ if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
+ return 0;
+ if (a == &dev_attr_hostid.attr && !ctrl->opts)
+ return 0;
return a->mode;
}
@@ -3294,7 +3377,7 @@ static const struct attribute_group *nvme_dev_attr_groups[] = {
NULL,
};
-static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
+static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
unsigned nsid)
{
struct nvme_ns_head *h;
@@ -3327,7 +3410,8 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,
}
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
- unsigned nsid, struct nvme_id_ns *id)
+ unsigned nsid, struct nvme_id_ns *id,
+ struct nvme_ns_ids *ids)
{
struct nvme_ns_head *head;
size_t size = sizeof(*head);
@@ -3350,12 +3434,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
goto out_ida_remove;
head->subsys = ctrl->subsys;
head->ns_id = nsid;
+ head->ids = *ids;
kref_init(&head->ref);
- ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
- if (ret)
- goto out_cleanup_srcu;
-
ret = __nvme_check_ids(ctrl->subsys, head);
if (ret) {
dev_err(ctrl->device,
@@ -3390,24 +3471,23 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
struct nvme_ctrl *ctrl = ns->ctrl;
bool is_shared = id->nmic & (1 << 0);
struct nvme_ns_head *head = NULL;
+ struct nvme_ns_ids ids;
int ret = 0;
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ if (ret)
+ goto out;
+
mutex_lock(&ctrl->subsys->lock);
if (is_shared)
- head = __nvme_find_ns_head(ctrl->subsys, nsid);
+ head = nvme_find_ns_head(ctrl->subsys, nsid);
if (!head) {
- head = nvme_alloc_ns_head(ctrl, nsid, id);
+ head = nvme_alloc_ns_head(ctrl, nsid, id, &ids);
if (IS_ERR(head)) {
ret = PTR_ERR(head);
goto out_unlock;
}
} else {
- struct nvme_ns_ids ids;
-
- ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
- if (ret)
- goto out_unlock;
-
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
@@ -3422,6 +3502,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
out_unlock:
mutex_unlock(&ctrl->subsys->lock);
+out:
if (ret > 0)
ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
@@ -3480,7 +3561,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
return 0;
}
-static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
@@ -3490,13 +3571,11 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
- return -ENOMEM;
+ return;
ns->queue = blk_mq_init_queue(ctrl->tagset);
- if (IS_ERR(ns->queue)) {
- ret = PTR_ERR(ns->queue);
+ if (IS_ERR(ns->queue))
goto out_free_ns;
- }
if (ctrl->opts && ctrl->opts->data_digest)
ns->queue->backing_dev_info->capabilities
@@ -3519,10 +3598,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ret)
goto out_free_queue;
- if (id->ncap == 0) {
- ret = -EINVAL;
+ if (id->ncap == 0) /* no namespace (legacy quirk) */
goto out_free_id;
- }
ret = nvme_init_ns_head(ns, nsid, id);
if (ret)
@@ -3531,10 +3608,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
disk = alloc_disk_node(0, node);
- if (!disk) {
- ret = -ENOMEM;
+ if (!disk)
goto out_unlink_ns;
- }
disk->fops = &nvme_fops;
disk->private_data = ns;
@@ -3565,8 +3640,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
kfree(id);
- return 0;
+ return;
out_put_disk:
+ /* prevent double queue cleanup */
+ ns->disk->queue = NULL;
put_disk(ns->disk);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
@@ -3579,9 +3656,6 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
- if (ret > 0)
- ret = blk_status_to_errno(nvme_error_status(ret));
- return ret;
}
static void nvme_ns_remove(struct nvme_ns *ns)
@@ -3987,6 +4061,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
}
+ ctrl->created = true;
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
@@ -3995,6 +4070,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
nvme_fault_inject_fini(&ctrl->fault_inject);
dev_pm_qos_hide_latency_tolerance(ctrl->device);
cdev_device_del(&ctrl->cdev, ctrl->device);
+ nvme_put_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
@@ -4077,6 +4153,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
if (ret)
goto out_release_instance;
+ nvme_get_ctrl(ctrl);
cdev_init(&ctrl->cdev, &nvme_dev_fops);
ctrl->cdev.owner = ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
@@ -4095,6 +4172,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
return 0;
out_free_name:
+ nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
out_release_instance:
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
@@ -4299,6 +4377,7 @@ static void __exit nvme_core_exit(void)
destroy_workqueue(nvme_delete_wq);
destroy_workqueue(nvme_reset_wq);
destroy_workqueue(nvme_wq);
+ ida_destroy(&nvme_instance_ida);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818ac9a1..2a6c8190eeb7 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -105,14 +105,14 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
int len = 0;
if (ctrl->opts->mask & NVMF_OPT_TRADDR)
- len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr);
+ len += scnprintf(buf, size, "traddr=%s", ctrl->opts->traddr);
if (ctrl->opts->mask & NVMF_OPT_TRSVCID)
- len += snprintf(buf + len, size - len, "%strsvcid=%s",
+ len += scnprintf(buf + len, size - len, "%strsvcid=%s",
(len) ? "," : "", ctrl->opts->trsvcid);
if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)
- len += snprintf(buf + len, size - len, "%shost_traddr=%s",
+ len += scnprintf(buf + len, size - len, "%shost_traddr=%s",
(len) ? "," : "", ctrl->opts->host_traddr);
- len += snprintf(buf + len, size - len, "\n");
+ len += scnprintf(buf + len, size - len, "\n");
return len;
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5a70ac395d53..7dfc4a2ecf1e 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
!template->ls_req || !template->fcp_io ||
!template->ls_abort || !template->fcp_abort ||
!template->max_hw_queues || !template->max_sgl_segments ||
- !template->max_dif_sgl_segments || !template->dma_boundary ||
- !template->module) {
+ !template->max_dif_sgl_segments || !template->dma_boundary) {
ret = -EINVAL;
goto out_reghost_failed;
}
@@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref)
{
struct nvme_fc_ctrl *ctrl =
container_of(ref, struct nvme_fc_ctrl, ref);
- struct nvme_fc_lport *lport = ctrl->lport;
unsigned long flags;
if (ctrl->ctrl.tagset) {
@@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref)
if (ctrl->ctrl.opts)
nvmf_free_options(ctrl->ctrl.opts);
kfree(ctrl);
- module_put(lport->ops->module);
}
static void
@@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_fail;
}
- if (!try_module_get(lport->ops->module)) {
- ret = -EUNATCH;
- goto out_free_ctrl;
- }
-
idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL);
if (idx < 0) {
ret = -ENOSPC;
- goto out_mod_put;
+ goto out_free_ctrl;
}
ctrl->ctrl.opts = opts;
@@ -3181,10 +3173,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto fail_ctrl;
}
- nvme_get_ctrl(&ctrl->ctrl);
-
if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
- nvme_put_ctrl(&ctrl->ctrl);
dev_err(ctrl->ctrl.device,
"NVME-FC{%d}: failed to schedule initial connect\n",
ctrl->cnum);
@@ -3235,8 +3224,6 @@ out_free_queues:
out_free_ida:
put_device(ctrl->dev);
ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
-out_mod_put:
- module_put(lport->ops->module);
out_free_ctrl:
kfree(ctrl);
out_fail:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index a11900cf3a36..54603bd3e02d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -64,17 +64,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
}
}
-void nvme_failover_req(struct request *req)
+bool nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
u16 status = nvme_req(req)->status;
unsigned long flags;
- spin_lock_irqsave(&ns->head->requeue_lock, flags);
- blk_steal_bios(&ns->head->requeue_list, req);
- spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
-
switch (status & 0x7ff) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
@@ -103,15 +98,17 @@ void nvme_failover_req(struct request *req)
nvme_mpath_clear_current_path(ns);
break;
default:
- /*
- * Reset the controller for any non-ANA error as we don't know
- * what caused the error.
- */
- nvme_reset_ctrl(ns->ctrl);
- break;
+ /* This was a non-ANA error so follow the normal error path. */
+ return false;
}
+ spin_lock_irqsave(&ns->head->requeue_lock, flags);
+ blk_steal_bios(&ns->head->requeue_list, req);
+ spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+ blk_mq_end_request(req, 0);
+
kblockd_schedule_work(&ns->head->requeue_work);
+ return true;
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -377,11 +374,10 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
+ q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node);
if (!q)
goto out;
q->queuedata = head;
- blk_queue_make_request(q, nvme_ns_head_make_request);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
@@ -514,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
- down_write(&ctrl->namespaces_rwsem);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
unsigned nsid = le32_to_cpu(desc->nsids[n]);
@@ -525,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (++n == nr_nsids)
break;
}
- up_write(&ctrl->namespaces_rwsem);
+ up_read(&ctrl->namespaces_rwsem);
return 0;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1024fec7914c..2e04a36296d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -259,6 +259,7 @@ struct nvme_ctrl {
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
unsigned long events;
+ bool created;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
@@ -550,7 +551,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
-void nvme_failover_req(struct request *req);
+bool nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -599,8 +600,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
}
-static inline void nvme_failover_req(struct request *req)
+static inline bool nvme_failover_req(struct request *req)
{
+ return false;
}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d3f23d6254e4..cc46e250fcac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -971,39 +971,34 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
nvme_end_request(req, cqe->status, cqe->result);
}
-static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
-{
- while (start != end) {
- nvme_handle_cqe(nvmeq, start);
- if (++start == nvmeq->q_depth)
- start = 0;
- }
-}
-
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
- if (nvmeq->cq_head == nvmeq->q_depth - 1) {
+ u16 tmp = nvmeq->cq_head + 1;
+
+ if (tmp == nvmeq->q_depth) {
nvmeq->cq_head = 0;
- nvmeq->cq_phase = !nvmeq->cq_phase;
+ nvmeq->cq_phase ^= 1;
} else {
- nvmeq->cq_head++;
+ nvmeq->cq_head = tmp;
}
}
-static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
- u16 *end, unsigned int tag)
+static inline int nvme_process_cq(struct nvme_queue *nvmeq)
{
int found = 0;
- *start = nvmeq->cq_head;
while (nvme_cqe_pending(nvmeq)) {
- if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
- found++;
+ found++;
+ /*
+ * load-load control dependency between phase and the rest of
+ * the cqe requires a full read memory barrier
+ */
+ dma_rmb();
+ nvme_handle_cqe(nvmeq, nvmeq->cq_head);
nvme_update_cq_head(nvmeq);
}
- *end = nvmeq->cq_head;
- if (*start != *end)
+ if (found)
nvme_ring_cq_doorbell(nvmeq);
return found;
}
@@ -1012,21 +1007,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
irqreturn_t ret = IRQ_NONE;
- u16 start, end;
/*
* The rmb/wmb pair ensures we see all updates from a previous run of
* the irq handler, even if that was on another CPU.
*/
rmb();
- nvme_process_cq(nvmeq, &start, &end, -1);
+ if (nvme_process_cq(nvmeq))
+ ret = IRQ_HANDLED;
wmb();
- if (start != end) {
- nvme_complete_cqes(nvmeq, start, end);
- return IRQ_HANDLED;
- }
-
return ret;
}
@@ -1039,46 +1029,30 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
}
/*
- * Poll for completions any queue, including those not dedicated to polling.
+ * Poll for completions for any interrupt driven queue
* Can be called from any context.
*/
-static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
+static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
{
struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
- u16 start, end;
- int found;
- /*
- * For a poll queue we need to protect against the polling thread
- * using the CQ lock. For normal interrupt driven threads we have
- * to disable the interrupt to avoid racing with it.
- */
- if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
- spin_lock(&nvmeq->cq_poll_lock);
- found = nvme_process_cq(nvmeq, &start, &end, tag);
- spin_unlock(&nvmeq->cq_poll_lock);
- } else {
- disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
- found = nvme_process_cq(nvmeq, &start, &end, tag);
- enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
- }
+ WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
- nvme_complete_cqes(nvmeq, start, end);
- return found;
+ disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ nvme_process_cq(nvmeq);
+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}
static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
struct nvme_queue *nvmeq = hctx->driver_data;
- u16 start, end;
bool found;
if (!nvme_cqe_pending(nvmeq))
return 0;
spin_lock(&nvmeq->cq_poll_lock);
- found = nvme_process_cq(nvmeq, &start, &end, -1);
- nvme_complete_cqes(nvmeq, start, end);
+ found = nvme_process_cq(nvmeq);
spin_unlock(&nvmeq->cq_poll_lock);
return found;
@@ -1255,7 +1229,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
/*
* Did we miss an interrupt?
*/
- if (nvme_poll_irqdisable(nvmeq, req->tag)) {
+ if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
+ nvme_poll(req->mq_hctx);
+ else
+ nvme_poll_irqdisable(nvmeq);
+
+ if (blk_mq_request_completed(req)) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, completion polled\n",
req->tag, nvmeq->qid);
@@ -1398,23 +1377,23 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
else
nvme_disable_ctrl(&dev->ctrl);
- nvme_poll_irqdisable(nvmeq, -1);
+ nvme_poll_irqdisable(nvmeq);
}
/*
* Called only on a device that has been disabled and after all other threads
- * that can check this device's completion queues have synced. This is the
- * last chance for the driver to see a natural completion before
- * nvme_cancel_request() terminates all incomplete requests.
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
*/
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
- u16 start, end;
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
- nvme_process_cq(&dev->queues[i], &start, &end, -1);
- nvme_complete_cqes(&dev->queues[i], start, end);
+ spin_lock(&dev->queues[i].cq_poll_lock);
+ nvme_process_cq(&dev->queues[i]);
+ spin_unlock(&dev->queues[i].cq_poll_lock);
}
}
@@ -2503,13 +2482,13 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
struct nvme_dev *dev = to_nvme_dev(ctrl);
nvme_dbbuf_dma_free(dev);
- put_device(dev->dev);
nvme_free_tagset(dev);
if (dev->ctrl.admin_q)
blk_put_queue(dev->ctrl.admin_q);
- kfree(dev->queues);
free_opal_dev(dev->ctrl.opal_dev);
mempool_destroy(dev->iod_mempool);
+ put_device(dev->dev);
+ kfree(dev->queues);
kfree(dev);
}
@@ -2689,7 +2668,7 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
- return snprintf(buf, size, "%s", dev_name(&pdev->dev));
+ return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
}
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
@@ -2835,7 +2814,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_reset_ctrl(&dev->ctrl);
- nvme_get_ctrl(&dev->ctrl);
async_schedule(nvme_async_probe, dev);
return 0;
@@ -2907,10 +2885,9 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
- nvme_uninit_ctrl(&dev->ctrl);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
- nvme_put_ctrl(&dev->ctrl);
+ nvme_uninit_ctrl(&dev->ctrl);
}
#ifdef CONFIG_PM_SLEEP
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0fe08c4dfd2f..cac8a930396a 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -142,14 +142,6 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static const struct blk_mq_ops nvme_rdma_mq_ops;
static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
-/* XXX: really should move to a generic header sooner or later.. */
-static inline void put_unaligned_le24(u32 val, u8 *p)
-{
- *p++ = val;
- *p++ = val >> 8;
- *p++ = val >> 16;
-}
-
static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
{
return queue - queue->ctrl->queues;
@@ -1024,8 +1016,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
if (!changed) {
- /* state change failure is ok if we're in DELETING state */
+ /*
+ * state change failure is ok if we're in DELETING state,
+ * unless we're during creation of a new controller to
+ * avoid races with teardown flow.
+ */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
+ WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
}
@@ -1345,7 +1342,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
int ret;
sge->addr = qe->dma;
- sge->length = sizeof(struct nvme_command),
+ sge->length = sizeof(struct nvme_command);
sge->lkey = queue->device->pd->local_dma_lkey;
wr.next = NULL;
@@ -2045,8 +2042,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- nvme_get_ctrl(&ctrl->ctrl);
-
mutex_lock(&nvme_rdma_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49d4373b84eb..c15a92163c1f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -20,6 +20,16 @@
struct nvme_tcp_queue;
+/* Define the socket priority to use for connections were it is desirable
+ * that the NIC consider performing optimized packet processing or filtering.
+ * A non-zero value being sufficient to indicate general consideration of any
+ * possible optimization. Making it a module param allows for alternative
+ * values that may be unique for some NIC implementations.
+ */
+static int so_priority;
+module_param(so_priority, int, 0644);
+MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+
enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU,
@@ -164,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
struct request *rq;
- unsigned int bytes;
if (unlikely(nvme_tcp_async_req(req)))
return false; /* async events don't have a request */
rq = blk_mq_rq_from_pdu(req);
- bytes = blk_rq_payload_bytes(rq);
- return rq_data_dir(rq) == WRITE && bytes &&
- bytes <= nvme_tcp_inline_data_size(req->queue);
+ return rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(req->queue);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -1017,8 +1025,15 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
if (req->state == NVME_TCP_SEND_DDGST)
ret = nvme_tcp_try_send_ddgst(req);
done:
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = 0;
+ } else if (ret < 0) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to send request %d\n", ret);
+ if (ret != -EPIPE && ret != -ECONNRESET)
+ nvme_tcp_fail_request(queue->request);
+ nvme_tcp_done_send_req(queue);
+ }
return ret;
}
@@ -1049,25 +1064,16 @@ static void nvme_tcp_io_work(struct work_struct *w)
int result;
result = nvme_tcp_try_send(queue);
- if (result > 0) {
+ if (result > 0)
pending = true;
- } else if (unlikely(result < 0)) {
- dev_err(queue->ctrl->ctrl.device,
- "failed to send request %d\n", result);
-
- /*
- * Fail the request unless peer closed the connection,
- * in which case error recovery flow will complete all.
- */
- if ((result != -EPIPE) && (result != -ECONNRESET))
- nvme_tcp_fail_request(queue->request);
- nvme_tcp_done_send_req(queue);
- return;
- }
+ else if (unlikely(result < 0))
+ break;
result = nvme_tcp_try_recv(queue);
if (result > 0)
pending = true;
+ else if (unlikely(result < 0))
+ return;
if (!pending)
return;
@@ -1248,13 +1254,67 @@ free_icreq:
return ret;
}
+static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
+{
+ return nvme_tcp_queue_id(queue) == 0;
+}
+
+static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
+}
+
+static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ !nvme_tcp_default_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+}
+
+static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ !nvme_tcp_default_queue(queue) &&
+ !nvme_tcp_read_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ] +
+ ctrl->io_queues[HCTX_TYPE_POLL];
+}
+
+static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+ int n = 0;
+
+ if (nvme_tcp_default_queue(queue))
+ n = qid - 1;
+ else if (nvme_tcp_read_queue(queue))
+ n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+ else if (nvme_tcp_poll_queue(queue))
+ n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
+ ctrl->io_queues[HCTX_TYPE_READ] - 1;
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+}
+
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
int qid, size_t queue_size)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
struct linger sol = { .l_onoff = 1, .l_linger = 0 };
- int ret, opt, rcv_pdu_size, n;
+ int ret, opt, rcv_pdu_size;
queue->ctrl = ctrl;
INIT_LIST_HEAD(&queue->send_list);
@@ -1309,6 +1369,17 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_sock;
}
+ if (so_priority > 0) {
+ ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
+ (char *)&so_priority, sizeof(so_priority));
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to set SO_PRIORITY sock opt, ret %d\n",
+ ret);
+ goto err_sock;
+ }
+ }
+
/* Set socket type of service */
if (nctrl->opts->tos >= 0) {
opt = nctrl->opts->tos;
@@ -1322,11 +1393,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
}
queue->sock->sk->sk_allocation = GFP_ATOMIC;
- if (!qid)
- n = 0;
- else
- n = (qid - 1) % num_online_cpus();
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
@@ -1861,8 +1928,13 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
}
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
- /* state change failure is ok if we're in DELETING state */
+ /*
+ * state change failure is ok if we're in DELETING state,
+ * unless we're during creation of a new controller to
+ * avoid races with teardown flow.
+ */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
}
@@ -2090,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (rq_data_dir(rq) == WRITE && req->data_len &&
+ if (!blk_rq_nr_phys_segments(rq))
+ nvme_tcp_set_sg_null(c);
+ else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
@@ -2117,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
- req->data_len = blk_rq_payload_bytes(rq);
+ req->data_len = blk_rq_nr_phys_segments(rq) ?
+ blk_rq_payload_bytes(rq) : 0;
req->curr_bio = rq->bio;
if (rq_data_dir(rq) == WRITE &&
@@ -2224,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ return 0;
+
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
@@ -2359,8 +2437,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- nvme_get_ctrl(&ctrl->ctrl);
-
mutex_lock(&nvme_tcp_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
mutex_unlock(&nvme_tcp_ctrl_mutex);