diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 130 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 11 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 10 | ||||
-rw-r--r-- | drivers/nvme/host/fault_inject.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 50 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 16 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 96 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 26 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 132 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 26 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 42 | ||||
-rw-r--r-- | drivers/nvme/host/trace.c | 24 | ||||
-rw-r--r-- | drivers/nvme/host/trace.h | 12 |
13 files changed, 330 insertions, 247 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6a9dd68c0f4f..470601980794 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/blkdev.h> @@ -151,11 +143,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); -static void nvme_delete_ctrl_work(struct work_struct *work) +static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_ctrl *ctrl = - container_of(work, struct nvme_ctrl, delete_work); - dev_info(ctrl->device, "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); @@ -167,6 +156,14 @@ static void nvme_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(ctrl); } +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + nvme_do_delete_ctrl(ctrl); +} + int nvme_delete_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) @@ -177,22 +174,22 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { int ret = 0; /* - * Keep a reference until the work is flushed since ->delete_ctrl - * can free the controller. + * Keep a reference until nvme_do_delete_ctrl() complete, + * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); - ret = nvme_delete_ctrl(ctrl); + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + ret = -EBUSY; if (!ret) - flush_work(&ctrl->delete_work); + nvme_do_delete_ctrl(ctrl); nvme_put_ctrl(ctrl); return ret; } -EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); static inline bool nvme_ns_has_pi(struct nvme_ns *ns) { @@ -611,6 +608,22 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_OK; } +static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + return nvme_setup_discard(ns, req, cmnd); + + cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; + cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->write_zeroes.slba = + cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->write_zeroes.length = + cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + cmnd->write_zeroes.control = 0; + return BLK_STS_OK; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { @@ -705,7 +718,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, nvme_setup_flush(ns, cmd); break; case REQ_OP_WRITE_ZEROES: - /* currently only aliased to deallocate for a few ctrls: */ + ret = nvme_setup_write_zeroes(ns, req, cmd); + break; case REQ_OP_DISCARD: ret = nvme_setup_discard(ns, req, cmd); break; @@ -1236,7 +1250,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (ns) { if (ctrl->effects) effects = le32_to_cpu(ctrl->effects->iocs[opcode]); - if (effects & ~NVME_CMD_EFFECTS_CSUPP) + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", opcode, effects); @@ -1481,10 +1495,10 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; - struct request_queue *queue = ns->queue; + struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { @@ -1512,6 +1526,32 @@ static void nvme_config_discard(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } +static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) +{ + u32 max_sectors; + unsigned short bs = 1 << ns->lba_shift; + + if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || + (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) + return; + /* + * Even though NVMe spec explicitly states that MDTS is not + * applicable to the write-zeroes:- "The restriction does not apply to + * commands that do not transfer data between the host and the + * controller (e.g., Write Uncorrectable ro Write Zeroes command).". + * In order to be more cautious use controller's max_hw_sectors value + * to configure the maximum sectors for the write-zeroes which is + * configured based on the controller's MDTS field in the + * nvme_init_identify() if available. + */ + if (ns->ctrl->max_hw_sectors == UINT_MAX) + max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9; + else + max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; + + blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); +} + static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { @@ -1565,7 +1605,9 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; set_capacity(disk, capacity); - nvme_config_discard(ns); + + nvme_config_discard(disk, ns); + nvme_config_write_zeroes(disk, ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); @@ -2280,6 +2322,9 @@ static struct attribute *nvme_subsys_attrs[] = { &subsys_attr_serial.attr, &subsys_attr_firmware_rev.attr, &subsys_attr_subsysnqn.attr, +#ifdef CONFIG_NVME_MULTIPATH + &subsys_attr_iopolicy.attr, +#endif NULL, }; @@ -2332,6 +2377,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; +#ifdef CONFIG_NVME_MULTIPATH + subsys->iopolicy = NVME_IOPOLICY_NUMA; +#endif subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; @@ -3163,21 +3211,23 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) return 0; } -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return; + return -ENOMEM; ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) + if (IS_ERR(ns->queue)) { + ret = PTR_ERR(ns->queue); goto out_free_ns; + } blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) @@ -3193,20 +3243,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_set_queue_limits(ctrl, ns->queue); id = nvme_identify_ns(ctrl, nsid); - if (!id) + if (!id) { + ret = -EIO; goto out_free_queue; + } - if (id->ncap == 0) + if (id->ncap == 0) { + ret = -EINVAL; goto out_free_id; + } - if (nvme_init_ns_head(ns, nsid, id)) + ret = nvme_init_ns_head(ns, nsid, id); + if (ret) goto out_free_id; nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); - if (!disk) + if (!disk) { + ret = -ENOMEM; goto out_unlink_ns; + } disk->fops = &nvme_fops; disk->private_data = ns; @@ -3218,7 +3275,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) __nvme_revalidate_disk(disk, id); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { + ret = nvme_nvm_register(ns, disk_name, node); + if (ret) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } @@ -3236,19 +3294,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_fault_inject_init(ns); kfree(id); - return; + return 0; out_put_disk: put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); out_free_id: kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + return ret; } static void nvme_ns_remove(struct nvme_ns *ns) @@ -3596,8 +3656,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); - if (ctrl->ops->stop_ctrl) - ctrl->ops->stop_ctrl(ctrl); } EXPORT_SYMBOL_GPL(nvme_stop_ctrl); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 3eb908c50e1a..d4cb826f58ff 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics common host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> @@ -430,6 +422,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); * @qid: NVMe I/O queue number for the new I/O connection between * host and target (note qid == 0 is illegal as this is * the Admin queue, per NVMe standard). + * @poll: Whether or not to poll for the completion of the connect cmd. * * This function issues a fabrics-protocol connection * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 478343b73e38..3044d8b99a24 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NVMe over Fabrics common host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVME_FABRICS_H #define _NVME_FABRICS_H 1 diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 02632266ac06..4cfd2c9222d4 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -1,8 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fault injection support for nvme. * * Copyright (c) 2018, Oracle and/or its affiliates - * */ #include <linux/moduleparam.h> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 89accc76d71c..f3b9d91ba0df 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Avago Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful. - * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, - * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A - * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO - * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. - * See the GNU General Public License for more details, a copy of which - * can be found in the file COPYING included with this package - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> @@ -2119,7 +2107,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, freq->sg_cnt = 0; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return 0; freq->sg_table.sgl = freq->first_sgl; @@ -2316,12 +2304,23 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - data_len = blk_rq_payload_bytes(rq); - if (data_len) + /* + * nvme core doesn't quite treat the rq opaquely. Commands such + * as WRITE ZEROES will return a non-zero rq payload_bytes yet + * there is no actual payload to be transferred. + * To get it right, key data transmission on there being 1 or + * more physical segments in the sg list. If there is no + * physical segments, there is no payload. + */ + if (blk_rq_nr_phys_segments(rq)) { + data_len = blk_rq_payload_bytes(rq); io_dir = ((rq_data_dir(rq) == WRITE) ? NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); - else + } else { + data_len = 0; io_dir = NVMEFC_FCP_NODATA; + } + return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } @@ -2476,6 +2475,7 @@ static int nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1; unsigned int nr_io_queues; int ret; @@ -2488,6 +2488,13 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) return ret; } + if (!nr_io_queues && prior_ioq_cnt) { + dev_info(ctrl->ctrl.device, + "Fail Reconnect: At least 1 io queue " + "required (was %d)\n", prior_ioq_cnt); + return -ENOSPC; + } + ctrl->ctrl.queue_count = nr_io_queues + 1; /* check for io queues existing */ if (ctrl->ctrl.queue_count == 1) @@ -2501,6 +2508,10 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + if (prior_ioq_cnt != nr_io_queues) + dev_info(ctrl->ctrl.device, + "reconnect: revising io queue count from %d to %d\n", + prior_ioq_cnt, nr_io_queues); blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); return 0; @@ -3018,7 +3029,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; - ctrl->ctrl.numa_node = dev_to_node(lport->dev); + if (lport->dev) + ctrl->ctrl.numa_node = dev_to_node(lport->dev); + else + ctrl->ctrl.numa_node = NUMA_NO_NODE; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b759c25c89c8..949e29e1d782 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -1,23 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * nvme-lightnvm.c - LightNVM NVMe device * * Copyright (C) 2014-2015 IT University of Copenhagen * Initial release: Matias Bjorling <mb@lightnvm.io> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * */ #include "nvme.h" diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index b9fff3b8ed1b..2839bb70badf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2017-2018 Christoph Hellwig. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/moduleparam.h> @@ -141,7 +133,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) test_bit(NVME_NS_ANA_PENDING, &ns->flags)) continue; - distance = node_distance(node, ns->ctrl->numa_node); + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) + distance = node_distance(node, ns->ctrl->numa_node); + else + distance = LOCAL_DISTANCE; switch (ns->ana_state) { case NVME_ANA_OPTIMIZED: @@ -168,6 +163,47 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) return found; } +static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, + struct nvme_ns *ns) +{ + ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, + siblings); + if (ns) + return ns; + return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); +} + +static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, + int node, struct nvme_ns *old) +{ + struct nvme_ns *ns, *found, *fallback = NULL; + + if (list_is_singular(&head->list)) + return old; + + for (ns = nvme_next_ns(head, old); + ns != old; + ns = nvme_next_ns(head, ns)) { + if (ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + continue; + + if (ns->ana_state == NVME_ANA_OPTIMIZED) { + found = ns; + goto out; + } + if (ns->ana_state == NVME_ANA_NONOPTIMIZED) + fallback = ns; + } + + if (!fallback) + return NULL; + found = fallback; +out: + rcu_assign_pointer(head->current_path[node], found); + return found; +} + static inline bool nvme_path_is_optimized(struct nvme_ns *ns) { return ns->ctrl->state == NVME_CTRL_LIVE && @@ -180,6 +216,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) + ns = nvme_round_robin_path(head, node, ns); if (unlikely(!ns || !nvme_path_is_optimized(ns))) ns = __nvme_find_path(head, node); return ns; @@ -471,6 +509,44 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) cancel_work_sync(&ctrl->ana_work); } +#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) + +static const char *nvme_iopolicy_names[] = { + [NVME_IOPOLICY_NUMA] = "numa", + [NVME_IOPOLICY_RR] = "round-robin", +}; + +static ssize_t nvme_subsys_iopolicy_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return sprintf(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); +} + +static ssize_t nvme_subsys_iopolicy_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + int i; + + for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { + if (sysfs_streq(buf, nvme_iopolicy_names[i])) { + WRITE_ONCE(subsys->iopolicy, i); + return count; + } + } + + return -EINVAL; +} +SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, + nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); + static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c4a1bb41abf0..527d64545023 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _NVME_H @@ -95,6 +87,11 @@ enum nvme_quirks { * Ignore device provided subnqn. */ NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8), + + /* + * Broken Write Zeroes. + */ + NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), }; /* @@ -252,6 +249,11 @@ struct nvme_ctrl { unsigned long discard_page_busy; }; +enum nvme_iopolicy { + NVME_IOPOLICY_NUMA, + NVME_IOPOLICY_RR, +}; + struct nvme_subsystem { int instance; struct device dev; @@ -271,6 +273,9 @@ struct nvme_subsystem { u8 cmic; u16 vendor_id; struct ida ns_ida; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_iopolicy iopolicy; +#endif }; /* @@ -364,7 +369,6 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); - void (*stop_ctrl)(struct nvme_ctrl *ctrl); }; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -459,7 +463,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, void *log, size_t size, u64 offset); @@ -492,6 +495,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute subsys_attr_iopolicy; #else static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7fee665ec45e..a90cf5d63aac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/aer.h> @@ -157,6 +149,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp) int n = 0, ret; ret = kstrtoint(val, 10, &n); + if (ret) + return ret; if (n > num_possible_cpus()) n = num_possible_cpus(); @@ -2041,53 +2035,52 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } -/* irq_queues covers admin queue */ -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues) +/* + * nirqs is the number of interrupts available for write and read + * queues. The core already reserved an interrupt for the admin queue. + */ +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) { - unsigned int this_w_queues = write_queues; - - WARN_ON(!irq_queues); + struct nvme_dev *dev = affd->priv; + unsigned int nr_read_queues; /* - * Setup read/write queue split, assign admin queue one independent - * irq vector if irq_queues is > 1. + * If there is no interupt available for queues, ensure that + * the default queue is set to 1. The affinity set size is + * also set to one, but the irq core ignores it for this case. + * + * If only one interrupt is available or 'write_queue' == 0, combine + * write and read queues. + * + * If 'write_queues' > 0, ensure it leaves room for at least one read + * queue. */ - if (irq_queues <= 2) { - dev->io_queues[HCTX_TYPE_DEFAULT] = 1; - dev->io_queues[HCTX_TYPE_READ] = 0; - return; - } - - /* - * If 'write_queues' is set, ensure it leaves room for at least - * one read queue and one admin queue - */ - if (this_w_queues >= irq_queues) - this_w_queues = irq_queues - 2; - - /* - * If 'write_queues' is set to zero, reads and writes will share - * a queue set. - */ - if (!this_w_queues) { - dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1; - dev->io_queues[HCTX_TYPE_READ] = 0; + if (!nrirqs) { + nrirqs = 1; + nr_read_queues = 0; + } else if (nrirqs == 1 || !write_queues) { + nr_read_queues = 0; + } else if (write_queues >= nrirqs) { + nr_read_queues = 1; } else { - dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; - dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1; + nr_read_queues = nrirqs - write_queues; } + + dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; + affd->set_size[HCTX_TYPE_READ] = nr_read_queues; + affd->nr_sets = nr_read_queues ? 2 : 1; } static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) { struct pci_dev *pdev = to_pci_dev(dev->dev); - int irq_sets[2]; struct irq_affinity affd = { - .pre_vectors = 1, - .nr_sets = ARRAY_SIZE(irq_sets), - .sets = irq_sets, + .pre_vectors = 1, + .calc_sets = nvme_calc_irq_sets, + .priv = dev, }; - int result = 0; unsigned int irq_queues, this_p_queues; /* @@ -2103,51 +2096,12 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) } dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; - /* - * For irq sets, we have to ask for minvec == maxvec. This passes - * any reduction back to us, so we can adjust our queue counts and - * IRQ vector needs. - */ - do { - nvme_calc_io_queues(dev, irq_queues); - irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; - irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; - if (!irq_sets[1]) - affd.nr_sets = 1; - - /* - * If we got a failure and we're down to asking for just - * 1 + 1 queues, just ask for a single vector. We'll share - * that between the single IO queue and the admin queue. - * Otherwise, we assign one independent vector to admin queue. - */ - if (irq_queues > 1) - irq_queues = irq_sets[0] + irq_sets[1] + 1; - - result = pci_alloc_irq_vectors_affinity(pdev, irq_queues, - irq_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); - - /* - * Need to reduce our vec counts. If we get ENOSPC, the - * platform should support mulitple vecs, we just need - * to decrease our ask. If we get EINVAL, the platform - * likely does not. Back down to ask for just one vector. - */ - if (result == -ENOSPC) { - irq_queues--; - if (!irq_queues) - return result; - continue; - } else if (result == -EINVAL) { - irq_queues = 1; - continue; - } else if (result <= 0) - return -EIO; - break; - } while (1); + /* Initialize for the single interrupt case */ + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; - return result; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } static void nvme_disable_io_queues(struct nvme_dev *dev) @@ -2983,7 +2937,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ @@ -3024,6 +2979,7 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { + BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 52abc3a6de12..11a5ecae78c8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> @@ -942,14 +934,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, } } -static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - cancel_work_sync(&ctrl->err_work); - cancel_delayed_work_sync(&ctrl->reconnect_work); -} - static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -1158,7 +1142,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return; if (req->mr) { @@ -1281,7 +1265,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return nvme_rdma_set_sg_null(c); req->sg_table.sgl = req->first_sgl; @@ -1854,6 +1838,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); + nvme_rdma_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); @@ -1902,7 +1889,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, - .stop_ctrl = nvme_rdma_stop_ctrl, }; /* diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 5f0a00425242..e7e08889865e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -463,6 +463,15 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, queue->data_remaining = le32_to_cpu(pdu->data_length); + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && + unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x SUCCESS set but not last PDU\n", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EPROTO; + } + return 0; } @@ -618,6 +627,14 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return ret; } +static inline void nvme_tcp_end_request(struct request *rq, __le16 status) +{ + union nvme_result res = {}; + + nvme_end_request(rq, cpu_to_le16(status << 1), res); +} + + static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -685,6 +702,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); nvme_tcp_init_recv_ctx(queue); } } @@ -695,6 +714,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; char *ddgst = (char *)&queue->recv_ddgst; size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; @@ -718,6 +738,13 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return -EIO; } + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + } + nvme_tcp_init_recv_ctx(queue); return 0; } @@ -815,10 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - union nvme_result res = {}; - - nvme_end_request(blk_mq_rq_from_pdu(req), - cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) @@ -1822,6 +1846,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); + nvme_tcp_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(ctrl); @@ -1859,12 +1886,6 @@ out_fail: nvme_tcp_reconnect_or_remove(ctrl); } -static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) -{ - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); - cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); -} - static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); @@ -2115,7 +2136,6 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, .get_address = nvmf_get_address, - .stop_ctrl = nvme_tcp_stop_ctrl, }; static bool diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5566dda3237a..5f24ea7a28eb 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver tracepoints * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <asm/unaligned.h> @@ -58,7 +50,19 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_putc(p, 0); + return ret; +} static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { @@ -109,6 +113,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_get_features: + return nvme_trace_admin_get_features(p, cdw10); default: return nvme_trace_common(p, cdw10); } diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 3564120aa7b3..97d3c77365b8 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NVM Express device driver tracepoints * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #undef TRACE_SYSTEM @@ -116,7 +108,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->metadata = le64_to_cpu(cmd->common.metadata); __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, &cmd->common.cdw10, - 6 * sizeof(__entry->cdw10)); + sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), |